diff --git a/README b/README index 72b3b3d4..967bcdcf 100644 --- a/README +++ b/README @@ -17,18 +17,17 @@ Based on: - Github users contributions. Installation:: - - easy_install readability-lxml - or - pip install readability-lxml - + -download code + -python setup.py install Usage:: from readability.readability import Document import urllib html = urllib.urlopen(url).read() - readable_article = Document(html).summary() - readable_title = Document(html).short_title() + doc = Document(html) + short_title = doc.short_title() + text_content = doc.text_content() + pub_date = doc.get_publish_date() Command-line usage:: diff --git a/readability/htmls.py b/readability/htmls.py index 9b599935..dac585d8 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -1,6 +1,7 @@ from cleaners import normalize_spaces, clean_attributes from encoding import get_encoding from lxml.html import tostring +from lxml.etree import tounicode import logging import lxml.html import re @@ -44,14 +45,14 @@ def norm_title(title): def get_title(doc): title = doc.find('.//title') - if title is None or len(title.text) == 0: + if title is None or not title.text: return '[no-title]' return norm_title(title.text) def add_match(collection, text, orig): text = norm_title(text) - if len(text.split()) >= 2 and len(text) >= 15: + if len(text) >= 5: if text.replace('"', '') in orig.replace('"', ''): collection.add(text) @@ -61,46 +62,39 @@ def shorten_title(doc): return '' title = orig = norm_title(title.text) - - candidates = set() + candidates = set() for item in ['.//h1', './/h2', './/h3']: for e in list(doc.iterfind(item)): - if e.text: - add_match(candidates, e.text, orig) - if e.text_content(): - add_match(candidates, e.text_content(), orig) - + text = e.text or e.text_content() + add_match(candidates, text, orig) for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']: for e in doc.cssselect(item): - if e.text: - add_match(candidates, e.text, orig) - if e.text_content(): - add_match(candidates, e.text_content(), orig) - + text = e.text or e.text_content() + add_match(candidates, text, orig) if candidates: title = sorted(candidates, key=len)[-1] + #else: + for delimiter in ['|', '-', ' :: ', ' / ', '_', " "]: + if delimiter in title: + parts = orig.split(delimiter) + title = parts[0] + #if len(parts[0].split()) >= 4: + # title = parts[0] + # break + #elif len(parts[-1].split()) >= 4: + # title = parts[-1] + # break else: - for delimiter in [' | ', ' - ', ' :: ', ' / ']: - if delimiter in title: - parts = orig.split(delimiter) - if len(parts[0].split()) >= 4: - title = parts[0] - break - elif len(parts[-1].split()) >= 4: - title = parts[-1] - break - else: - if ': ' in title: - parts = orig.split(': ') - if len(parts[-1].split()) >= 4: - title = parts[-1] - else: - title = orig.split(': ', 1)[1] - - if not 15 < len(title) < 150: + if ': ' in title: + parts = orig.split(': ') + if len(parts[-1].split()) >= 4: + title = parts[-1] + else: + title = orig.split(': ', 1)[1] + + if not 5 < len(title) < 150: return orig - return title def get_body(doc): @@ -113,3 +107,28 @@ def get_body(doc): except Exception: #FIXME find the equivalent lxml error logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) return raw_html + + + + + + + +def remove_ctrl_char(origin_str): + #ctr_chars = [u'\x%02d' % i for i in range(0, 32)] + ctr_chars = [u'\u0000', u'\u0001', u'\u0002', u'\u0003', u'\u0004', u'\u0005', u'\u0006', u'\u0007', u'\u0008', u'\u0009', + u'\u000a', u'\u000b', u'\u000c', u'\u000d', u'\u000e', u'\u000f', u'\u0010', u'\u0011', u'\u0012', u'\u0013', + u'\u0014', u'\u0015', u'\u0016', u'\u0017', u'\u0018', u'\u0019', u'\u001a', u'\u001b', u'\u001c', u'\u001d', + u'\u001e', u'\u001f'] + if not isinstance(origin_str, unicode): + origin_str = unicode(origin_str) + + regex = re.compile(u'|'.join(ctr_chars)) + return regex.subn(u'', origin_str)[0] + + +def merge_space(origin_str): + if not isinstance(origin_str, unicode): + origin_str = unicode(origin_str) + regex = re.compile(u"(\s)+", re.UNICODE) + return regex.subn(u'\\1', origin_str)[0] diff --git a/readability/readability.py b/readability/readability.py index fc376368..c708ec07 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,7 +1,10 @@ #!/usr/bin/env python -import logging +#coding=utf-8 import re import sys +import copy +import logging +import datetime from collections import defaultdict from lxml.etree import tostring @@ -9,17 +12,25 @@ from lxml.html import document_fromstring from lxml.html import fragment_fromstring -from cleaners import clean_attributes -from cleaners import html_cleaner from htmls import build_doc from htmls import get_body from htmls import get_title from htmls import shorten_title +from htmls import merge_space +from encoding import get_encoding +from cleaners import html_cleaner +from cleaners import clean_attributes logging.basicConfig(level=logging.INFO) log = logging.getLogger() +DATE_REGEX = ( + re.compile(u'(?P\\d{4})(年|:|-|\/)(?P\\d{1,2})(月|:|-|\/)(?P\\d{1,2}).*?(?P\\d{1,2})(时|:)(?P\\d{1,2})(分|:)(?P\\d{1,2})', + re.UNICODE), + re.compile(u'(?P\\d{4})(年|:|-|\/)(?P\\d{1,2})(月|:|-|\/)(?P\\d{1,2}).*?(?P\\d{1,2})(时|:)(?P\\d{1,2})', re.UNICODE), + re.compile(u'(?P\\d{4})(年|:|-|\/)(?P\\d{1,2})(月|:|-|\/)(?P\\d{1,2})', re.UNICODE), + ) REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), @@ -27,6 +38,11 @@ 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I), 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I), 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), + 'dateRe': re.compile('time|date'), + 'dateAttrScoreRe': re.compile('post|article|pub|art'), + 'dateTextScoreRe': re.compile(u'发表|post|时间'), + 'authorRe': re.compile('author|auth|editor|user|owner|nick'), + 'authorTextRe': re.compile(u'(作者:|作者:|楼主:)(?P.*)\s') #'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), #'trimRe': re.compile('^\s+|\s+$/'), @@ -95,15 +111,95 @@ def __init__(self, input, **options): - url: will allow adjusting links to be absolute """ - self.input = input + if isinstance(input, unicode): + self.input = input + else: + enc = get_encoding(input) + self.input = input.decode(enc, u'ignore') + self.input = merge_space(self.input) self.options = options self.html = None + #self.post_title = None + #self.pub_date = None + #self.author = None + self.trans_html = None + self.trans_flag = False def _html(self, force=False): if force or self.html is None: self.html = self._parse(self.input) return self.html + def get_publish_date(self): + if not hasattr(self, "pub_date"): + html = self._html() + candidates = [] + for elem in self.tags(html, 'span', 'em', 'p', 'li', 'a', 'td', 'i', 'font', 'div'): + text = elem.text + if not text or len(text) < 8 or len(text) > 50: + continue + score = 0 + text += elem.get('title', '') + elem.get('data-field', '') + for regex in DATE_REGEX: + match = regex.search(text) + if match: + attribute_text = '%s %s' % (elem.get('id', ''), elem.get('class', '')) + if len(attribute_text) > 1: + if REGEXES['dateRe'].search(attribute_text): + score += 20 + if REGEXES['dateAttrScoreRe'].search(attribute_text): + score += 10 + if REGEXES['dateTextScoreRe'].search(text) or REGEXES['dateTextScoreRe'].search(elem.getparent().text_content()): + score += 100 + if len(candidates) <= 0: + score = 5 + candidate = {'result': match.groupdict(), 'score': score} + candidates.append(candidate) + break + + if score > 25: + break + candidates = sorted(candidates, key=lambda x: x['score'], reverse=True) + date_kwargs = candidates[0]['result'] if candidates else {} + if date_kwargs: + for key, value in date_kwargs.items(): + date_kwargs[key] = int(value) + self.pub_date = datetime.datetime(**date_kwargs) + else: + self.pub_date = None + return self.pub_date + + def get_author(self): + if not hasattr(self, 'author'): + html = self._html() + candidates = [] + for elem in self.tags(html, 'span', 'em', 'strong', 'p', 'li', 'a', 'td', 'div'): + text = elem.text and elem.text.strip() + if not text or len(text) > 50 or len(text) < 2: + continue + score = 0 + match = REGEXES['authorTextRe'].search(text) or REGEXES['authorTextRe'].search(elem.getparent().text_content()) + if match: + return match.group('author') + attribute_text = '%s %s' % (elem.get('id', ''), elem.get('class', '')) + if len(attribute_text) > 1: + score = 0 + match = REGEXES['authorRe'].search(attribute_text) + if match: + #print elem.tag, text.encode('utf-8'), attribute_text + score += 10 + if 'nick' in attribute_text or 'name' in attribute_text: + score += 5 + if not len(candidates): + score += 3 + score += 2.0/len(text) + candidates.append({'txt': text, 'score': score}) + + candidates = sorted(candidates, key=lambda x: x['score'], reverse=True) + self.author = candidates[0]['txt'] if candidates else u"" + + return self.author + def _parse(self, input): doc = build_doc(input) doc = html_cleaner.clean_html(doc) @@ -115,37 +211,35 @@ def _parse(self, input): return doc def content(self): - return get_body(self._html(True)) + return get_body(self._html()) def title(self): - return get_title(self._html(True)) + return get_title(self._html()) def short_title(self): - return shorten_title(self._html(True)) - - def get_clean_html(self): - return clean_attributes(tounicode(self.html)) + if not hasattr(self, 'post_title'): + self.post_title = shorten_title(self._html()) - def summary(self, html_partial=False): - """Generate the summary of the html docuemnt + return self.post_title - :param html_partial: return only the div of the document, don't wrap - in html and body tags. + def text_content(self): + if not self.trans_flag: + self.transform() + return self.trans_html.text_content() if self.trans_html is not None else u"" - """ + def transform(self, html_partial=False): try: ruthless = True while True: - self._html(True) - for i in self.tags(self.html, 'script', 'style'): + html = copy.deepcopy(self._html()) + for i in self.tags(html, 'script', 'style'): i.drop_tree() - for i in self.tags(self.html, 'body'): + for i in self.tags(html, 'body'): i.set('id', 'readabilityBody') if ruthless: - self.remove_unlikely_candidates() - self.transform_misused_divs_into_paragraphs() - candidates = self.score_paragraphs() - + html = self.remove_unlikely_candidates(html) + html = self.transform_misused_divs_into_paragraphs(html) + candidates = self.score_paragraphs(html) best_candidate = self.select_best_candidate(candidates) if best_candidate: @@ -164,10 +258,11 @@ def summary(self, html_partial=False): log.debug( ("Ruthless and lenient parsing did not work. " "Returning raw html")) - article = self.html.find('body') + article = html.find('body') if article is None: - article = self.html - cleaned_article = self.sanitize(article, candidates) + article = html + html = self.sanitize(article, candidates) + cleaned_article = self.get_clean_html(html) article_length = len(cleaned_article or '') retry_length = self.options.get( 'retry_length', @@ -178,11 +273,28 @@ def summary(self, html_partial=False): # Loop through and try again. continue else: - return cleaned_article + self.trans_flag = True + self.trans_html = html + return self.trans_html except StandardError, e: log.exception('error getting summary: ') raise Unparseable(str(e)), None, sys.exc_info()[2] + + def get_clean_html(self, html): + return clean_attributes(tounicode(html)) + + def summary(self, html_partial=False): + """Generate the summary of the html docuemnt + + :param html_partial: return only the div of the document, don't wrap + in html and body tags. + + """ + if not self.trans_flag: + self.transform() + return self.get_clean_html(self.trans_html) + def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for # content that might also be related. @@ -196,7 +308,9 @@ def get_article(self, candidates, best_candidate, html_partial=False): else: output = document_fromstring('
') best_elem = best_candidate['elem'] - for sibling in best_elem.getparent().getchildren(): + siblings = best_elem.getparent().getchildren() if best_elem.getparent() is not None else [best_elem,] + for sibling in siblings: + #for sibling in best_elem.getparent().getchildren(): # in lxml there no concept of simple text # if isinstance(sibling, NavigableString): continue append = False @@ -253,13 +367,15 @@ def get_link_density(self, elem): total_length = text_length(elem) return float(link_length) / max(total_length, 1) - def score_paragraphs(self, ): + def score_paragraphs(self, html): + if html is None: + html = self._html() MIN_LEN = self.options.get( 'min_text_length', self.TEXT_LENGTH_THRESHOLD) candidates = {} ordered = [] - for elem in self.tags(self._html(), "p", "pre", "td"): + for elem in self.tags(html, "p", "pre", "td"): parent_node = elem.getparent() if parent_node is None: continue @@ -347,8 +463,10 @@ def debug(self, *a): if self.options.get('debug', False): log.debug(*a) - def remove_unlikely_candidates(self): - for elem in self.html.iter(): + def remove_unlikely_candidates(self, html): + if html is None: + html = self._html() + for elem in html.iter(): s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) if len(s) < 2: continue @@ -356,9 +474,12 @@ def remove_unlikely_candidates(self): if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: self.debug("Removing unlikely candidate - %s" % describe(elem)) elem.drop_tree() + return html - def transform_misused_divs_into_paragraphs(self): - for elem in self.tags(self.html, 'div'): + def transform_misused_divs_into_paragraphs(self, html): + if html is None: + html = self._html() + for elem in self.tags(html, 'div'): # transform
s that do not contain other block elements into #

s #FIXME: The current implementation ignores all descendants that @@ -371,7 +492,7 @@ def transform_misused_divs_into_paragraphs(self): elem.tag = "p" #print "Fixed element "+describe(elem) - for elem in self.tags(self.html, 'div'): + for elem in self.tags(html, 'div'): if elem.text and elem.text.strip(): p = fragment_fromstring('

') p.text = elem.text @@ -389,6 +510,7 @@ def transform_misused_divs_into_paragraphs(self): if child.tag == 'br': #print 'Dropped
at '+describe(elem) child.drop_tree() + return html def tags(self, node, *tag_names): for tag_name in tag_names: @@ -452,10 +574,10 @@ def sanitize(self, node, candidates): #if el.tag == 'div' and counts["img"] >= 1: # continue - if counts["p"] and counts["img"] > counts["p"]: - reason = "too many images (%s)" % counts["img"] - to_remove = True - elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol": + #if counts["p"] and counts["img"] > counts["p"]: + # reason = "too many images (%s)" % counts["img"] + # to_remove = True + if counts["li"] > counts["p"] and tag != "ul" and tag != "ol": reason = "more

  • s than

    s" to_remove = True elif counts["input"] > (counts["p"] / 3): @@ -532,9 +654,9 @@ def sanitize(self, node, candidates): if not self.options.get('attributes', None): #el.attrib = {} #FIXME:Checkout the effects of disabling this pass - - self.html = node - return self.get_clean_html() + return node + #self.trans_html = node + #return self.get_clean_html(self.trans_html) class HashableElement(): @@ -589,5 +711,12 @@ def main(): finally: file.close() +def search(date_str): + for regex in DATE_REGEX: + match = regex.search(date_str) + if match: + print match.groupdict() + if __name__ == '__main__': - main() + #main() + search(u'2013-01-26 15:32') diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 00000000..32057f54 --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,17 @@ +#coding=utf-8 + +from readability.readability import Document + +def main(): + html = open('./samples/21853124_0.shtml').read() + doc = Document(html) + doc.transform() + doc.get_publish_date() + doc.short_title() + doc.text_content() + +if __name__ == '__main__': + from timeit import Timer + t = Timer("main()", "from __main__ import main") + print t.repeat(3, number=100) + diff --git a/tests/samples/21853124_0.shtml b/tests/samples/21853124_0.shtml new file mode 100644 index 00000000..8ebf233c --- /dev/null +++ b/tests/samples/21853124_0.shtml @@ -0,0 +1,1719 @@ + + + + + + + + + 杨雄当选上海市长 “70后”时光辉当选副市长_资讯频道_凤凰网 + + + + + + + + + + + + + + +

    +
    +

    凤凰网首页 + 手机凤凰网 + 新闻客户端 +

    + + + + +
    + + +
    + +
    + + + + + + + +
    +
    +
    + +
    解放军新年第一“剑” +

    解放军新年第一“剑”

    习近平阐明底线,海军3主力舰高调穿岛链,南海舰队赴黄岩岛。

    +
    + + +
    + + +
    风,不是毒雾终结者

    风,不是毒雾终结者

    希望大风带走雾霾的同时,别悄悄带走治理毒雾的决心。

    + + + +
    毛泽东未能完成中国的现代化

    毛泽东未能完成中国的现代化

    沈志华:毛泽东要完成中国的现代化,但他未能完成建国后的任务。

    + + +
    + +
    +
    + + + + + +
    +
    +
    +
    +

    杨雄当选上海市长 “70后”时光辉当选副市长

    +
    +

    + 2013年02月01日 11:16
    来源:解放日报

    + + +
    +
    +
    +
    '正在加载中...'
    +
    + +

    +

    杨雄

    +

    +

    时光辉

    +

    【杨雄当选上海市巿长】上海市十四届人大一次会议今天上午举行第四次全体会议,选举杨雄为上海市人民政府市长,屠光绍、艾宝俊、沈晓明、赵雯、姜平、周波、翁铁慧、时光辉为副市长。记者王海燕

    +

    杨雄简历

    +

    杨雄,男,1953年11月生,汉族,浙江杭州人,学历研究生,经济学硕士,高级经济师。现任上海市委副书记、常务副市长、市政府党组书记。

    +

    杨雄1985年7月毕业于中国社会科学院研究生院。历任:市经济研究中心副处长,上海实事公司综合信息部副经理,市计委长远计划综合处副处长、处长,市计委主任助理兼计划投资处处长,市计委副主任,上海联和投资有限公司总经理、市信息投资股份有限公司董事长、上海航空公司董事长、上海航空股份有限公司监事会主席。2001年2月,任市政府副秘书长。

    +

    2003年2月,当选上海市副市长。

    +

    2007年5月任中共上海市委常委、上海市副市长。

    +

    2008年1月任中共上海市委常委、上海市常务副市长。

    +

    2012年12月任中共上海市委副书记、上海市常务副市长、上海市政府党组书记。

    +

    2013年2月任上海市长。

    +

    时光辉简历

    +

    时光辉,男,汉族,1970年1月生,安徽阜阳人,大学学历,经济学硕士,高级工程师,中共党员,1991年7月参加工作,1993年7月加入中国共产党。

    +

    1991年7月同济大学毕业后,历任上海市市政二公司宝钢分公司施工员、助理工程师、质监科科长、副经理,市政二公司经理助理、副总经理,市政工程建设处(公司)处长、总经理,市市政工程管理局计划财务处处长兼局专营办公室主任,市市政工程管理局副局长,中共静安区委常委,静安区委常委、副区长,中共奉贤区委副书记等职。

    +

    2008年6月任上海市奉贤区委副书记、代区长。

    +

    2008年7月30日当选上海奉贤区区长。

    +

    2011年8月任中共奉贤区委书记。

    +

    2013年2月任上海副市长。 + +

    +
    + +
    + + + + + + + + +
    + +
    + +
    +
    + [责任编辑:PN039] 标签:上海市
    +
    + + + + + +
    打印转发
    +
    +
     
    +
    + + +
    +
    +
    + 3g.ifeng.com 用手机随时随地看新闻 +
    +
    + +
    + + +
    + + + + + + + + +
    +
    +
    + + + + +
    + + + +
    + + + +
    +
    + + + + + + + + + +
    +
    + + + + + + + + + +
    +
    +
    + +
    + + + + + + +
    + + + + + +
    + + + + +
    + + + +
    +
    +
    + + + + + + + +
    +
    + +
    + + + +

    商讯

    +
      +
    • +
      + +
      +
    • + +
    • +
      + +
      +
    • + +
    • +
      + +
      +
    • + +
    • +
      + +
      +
    • +
    + + + +
    +
    + + + + + + +
    + + + + +
    + + +
    + + + +
    +
    +

    新闻图片

    + +
    +
    + + + + + + +
    + +
    + +图 + +
    + + +
    + +
    + + + + + + + +
    +

    男人必看:让男人“强壮”的秘密武器

    +
    图片说明
    + +
    + +
    + + + + +
    + +
    + + +
    +
    + + + + + + + +
    +
    +
    + + + + + + + + + +
    +
    +
    + + + + +
    +
    + + + + + + + + +
    + +
    + +
    +
    + + + +
    +
    +
    +
    +
    + + + + + + + +
    +
    + + +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_pub_date.py b/tests/test_pub_date.py new file mode 100644 index 00000000..04b26e2b --- /dev/null +++ b/tests/test_pub_date.py @@ -0,0 +1,18 @@ +#coding=utf-8 + +import unittest, datetime +from readability.readability import Document + +class PubDateTestCase(unittest.TestCase): + def setUp(self): + self.html = open('./samples/21853124_0.shtml').read() + + def test_pub_date(self): + html = self.html + doc = Document(html) + doc.transform() + self.assertEqual(datetime.datetime(2013, 2, 1, 11, 16), doc.get_publish_date()) + self.assertEqual('PN039', doc.get_author()) + +if __name__ == '__main__': + unittest.main()