From 0104d4ba879e9c34f7872cc0f5f28ce20a2016cd Mon Sep 17 00:00:00 2001 From: Francis Tseng Date: Wed, 18 Sep 2013 13:47:39 -0400 Subject: [PATCH 1/3] python 3 port, retaining 2.7 compatibility --- .gitignore | 2 ++ readability/htmls.py | 13 +++++++++---- readability/readability.py | 36 ++++++++++++++++++++++-------------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 84fca1f2..fbf0efe6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ dist /man nosetests.xml .coverage +dev-env +dev-env2.7 diff --git a/readability/htmls.py b/readability/htmls.py index 9b599935..ca594c82 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -1,16 +1,21 @@ -from cleaners import normalize_spaces, clean_attributes -from encoding import get_encoding +from .cleaners import normalize_spaces, clean_attributes +from .encoding import get_encoding from lxml.html import tostring import logging import lxml.html import re +# Python 2.7 compatibility. +import sys +if sys.version < '3': + str = unicode + logging.getLogger().setLevel(logging.DEBUG) utf8_parser = lxml.html.HTMLParser(encoding='utf-8') def build_doc(page): - if isinstance(page, unicode): + if isinstance(page, str): page_unicode = page else: enc = get_encoding(page) @@ -105,7 +110,7 @@ def shorten_title(doc): def get_body(doc): [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ] - raw_html = unicode(tostring(doc.body or doc)) + raw_html = str(tostring(doc.body or doc)) cleaned = clean_attributes(raw_html) try: #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? 
diff --git a/readability/readability.py b/readability/readability.py index fc376368..23633131 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -9,13 +9,16 @@ from lxml.html import document_fromstring from lxml.html import fragment_fromstring -from cleaners import clean_attributes -from cleaners import html_cleaner -from htmls import build_doc -from htmls import get_body -from htmls import get_title -from htmls import shorten_title +from .cleaners import clean_attributes +from .cleaners import html_cleaner +from .htmls import build_doc +from .htmls import get_body +from .htmls import get_title +from .htmls import shorten_title +# Python 2.7 compatibility. +if sys.version < '3': + str = unicode logging.basicConfig(level=logging.INFO) log = logging.getLogger() @@ -179,9 +182,9 @@ def summary(self, html_partial=False): continue else: return cleaned_article - except StandardError, e: + except Exception as e: log.exception('error getting summary: ') - raise Unparseable(str(e)), None, sys.exc_info()[2] + raise Unparseable(str(e)) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -231,7 +234,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): return output def select_best_candidate(self, candidates): - sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) + sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True) for candidate in sorted_candidates[:5]: elem = candidate['elem'] self.debug("Top 5 : %6.3f %s" % ( @@ -366,7 +369,7 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an <img> # buried within an <a> for example if not REGEXES['divToPElementsRe'].search( - unicode(''.join(map(tostring, list(elem))))): + str(b''.join(map(tostring, list(elem))))): #self.debug("Altering %s to p" % (describe(elem))) 
elem.tag = "p" #print "Fixed element "+describe(elem) @@ -577,15 +580,20 @@ def main(): file = None if options.url: - import urllib - file = urllib.urlopen(options.url) + # Python 2.7 compatibility + # Python 2.7 support. + try: + from urllib import request + except ImportError: + import urllib2 as request + file = request.urlopen(options.url) else: file = open(args[0], 'rt') enc = sys.__stdout__.encoding or 'utf-8' try: - print Document(file.read(), + print(Document(file.read(), debug=options.verbose, - url=options.url).summary().encode(enc, 'replace') + url=options.url).summary().encode(enc, 'replace')) finally: file.close() From d58d563299f0442ee6097755e2651b7fb5e72126 Mon Sep 17 00:00:00 2001 From: Francis Tseng Date: Wed, 18 Sep 2013 13:50:51 -0400 Subject: [PATCH 2/3] encoding regex fix --- readability/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readability/encoding.py b/readability/encoding.py index d05b7f44..efb690a0 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -2,7 +2,7 @@ import chardet def get_encoding(page): - text = re.sub('</?[^>]*>\s*', ' ', page) + text = re.sub(b'</?[^>]*>\s*', b' ', page) enc = 'utf-8' if not text.strip() or len(text) < 10: return enc # can't guess From 6c0e08d686fd66a627d2005caa2924183670ce66 Mon Sep 17 00:00:00 2001 From: Francis Tseng Date: Wed, 18 Sep 2013 13:52:43 -0400 Subject: [PATCH 3/3] gitignore cleanup --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index fbf0efe6..84fca1f2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,3 @@ dist /man nosetests.xml .coverage -dev-env -dev-env2.7