From 0104d4ba879e9c34f7872cc0f5f28ce20a2016cd Mon Sep 17 00:00:00 2001 From: Francis Tseng Date: Wed, 18 Sep 2013 13:47:39 -0400 Subject: [PATCH 1/3] python 3 port, retaining 2.7 compatibility --- .gitignore | 2 ++ readability/htmls.py | 13 +++++++++---- readability/readability.py | 36 ++++++++++++++++++++++-------------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 84fca1f2..fbf0efe6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ dist /man nosetests.xml .coverage +dev-env +dev-env2.7 diff --git a/readability/htmls.py b/readability/htmls.py index 9b599935..ca594c82 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -1,16 +1,21 @@ -from cleaners import normalize_spaces, clean_attributes -from encoding import get_encoding +from .cleaners import normalize_spaces, clean_attributes +from .encoding import get_encoding from lxml.html import tostring import logging import lxml.html import re +# Python 2.7 compatibility. +import sys +if sys.version < '3': + str = unicode + logging.getLogger().setLevel(logging.DEBUG) utf8_parser = lxml.html.HTMLParser(encoding='utf-8') def build_doc(page): - if isinstance(page, unicode): + if isinstance(page, str): page_unicode = page else: enc = get_encoding(page) @@ -105,7 +110,7 @@ def shorten_title(doc): def get_body(doc): [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ] - raw_html = unicode(tostring(doc.body or doc)) + raw_html = str(tostring(doc.body or doc)) cleaned = clean_attributes(raw_html) try: #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? 
diff --git a/readability/readability.py b/readability/readability.py index fc376368..23633131 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -9,13 +9,16 @@ from lxml.html import document_fromstring from lxml.html import fragment_fromstring -from cleaners import clean_attributes -from cleaners import html_cleaner -from htmls import build_doc -from htmls import get_body -from htmls import get_title -from htmls import shorten_title +from .cleaners import clean_attributes +from .cleaners import html_cleaner +from .htmls import build_doc +from .htmls import get_body +from .htmls import get_title +from .htmls import shorten_title +# Python 2.7 compatibility. +if sys.version < '3': + str = unicode logging.basicConfig(level=logging.INFO) log = logging.getLogger() @@ -179,9 +182,9 @@ def summary(self, html_partial=False): continue else: return cleaned_article - except StandardError, e: + except Exception as e: log.exception('error getting summary: ') - raise Unparseable(str(e)), None, sys.exc_info()[2] + raise Unparseable(str(e)) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -231,7 +234,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): return output def select_best_candidate(self, candidates): - sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) + sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True) for candidate in sorted_candidates[:5]: elem = candidate['elem'] self.debug("Top 5 : %6.3f %s" % ( @@ -366,7 +369,7 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an <img> # buried within an <a> for example if not REGEXES['divToPElementsRe'].search( - unicode(''.join(map(tostring, list(elem))))): + str(b''.join(map(tostring, list(elem))))): #self.debug("Altering %s to p" % (describe(elem))) 
elem.tag = "p" #print "Fixed element "+describe(elem) @@ -577,15 +580,20 @@ def main(): file = None if options.url: - import urllib - file = urllib.urlopen(options.url) + # Python 2.7 compatibility + # Python 2.7 support. + try: + from urllib import request + except ImportError: + import urllib2 as request + file = request.urlopen(options.url) else: file = open(args[0], 'rt') enc = sys.__stdout__.encoding or 'utf-8' try: - print Document(file.read(), + print(Document(file.read(), debug=options.verbose, - url=options.url).summary().encode(enc, 'replace') + url=options.url).summary().encode(enc, 'replace')) finally: file.close() From d58d563299f0442ee6097755e2651b7fb5e72126 Mon Sep 17 00:00:00 2001 From: Francis Tseng Date: Wed, 18 Sep 2013 13:50:51 -0400 Subject: [PATCH 2/3] encoding regex fix --- readability/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readability/encoding.py b/readability/encoding.py index d05b7f44..efb690a0 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -2,7 +2,7 @@ import chardet def get_encoding(page): - text = re.sub('</?[^>]*>\s*', ' ', page) + text = re.sub(b'</?[^>]*>\s*', b' ', page) enc = 'utf-8' if not text.strip() or len(text) < 10: return enc # can't guess From 6c0e08d686fd66a627d2005caa2924183670ce66 Mon Sep 17 00:00:00 2001 From: Francis Tseng Date: Wed, 18 Sep 2013 13:52:43 -0400 Subject: [PATCH 3/3] gitignore cleanup --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index fbf0efe6..84fca1f2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,3 @@ dist /man nosetests.xml .coverage -dev-env -dev-env2.7