python-readability/readability/htmls.py at develop · ProgrammerHuang/python-readability

History

144 lines (114 loc) · 3.9 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

from lxml.html import tostring

import lxml.html

import re

from .cleaners import normalize_spaces, clean_attributes

from .encoding import get_encoding

from .compat import str_

# Shared lxml HTML parser configured for UTF-8 input; build_doc() passes it to
# document_fromstring() after re-encoding every page to UTF-8.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")

def build_doc(page):
    """Parse *page* into an lxml HTML document.

    Args:
        page: The page markup, either already-decoded text (an instance of
            ``str_``) or raw bytes.

    Returns:
        A ``(doc, encoding)`` tuple. ``encoding`` is ``None`` when *page* was
        already text; otherwise it is the value reported by ``get_encoding``
        or ``"utf-8"`` when that returns a falsy result.
    """
    if isinstance(page, str_):
        encoding, decoded_page = None, page
    else:
        encoding = get_encoding(page) or "utf-8"
        decoded_page = page.decode(encoding, "replace")

    # XXX: the decode/encode round trip is done even for UTF-8 pages because
    # it strips out bad characters before lxml sees them.
    utf8_bytes = decoded_page.encode("utf-8", "replace")
    doc = lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser)
    return doc, encoding

def js_re(src, pattern, flags, repl):
    """Emulate JavaScript's ``src.replace(/pattern/flags, repl)``.

    Args:
        src: The subject string to search.
        pattern: Regular expression source.
        flags: ``re`` module flags passed to ``re.compile``.
        repl: Replacement template written JavaScript-style, with ``$1``,
            ``$2``, ... for group references. Every ``$`` is rewritten to a
            backslash so Python's ``re`` understands the references.

    Returns:
        *src* with every match of *pattern* replaced.
    """
    # Pattern.sub takes (replacement, subject); the previous code passed the
    # two in the opposite order, so it substituted inside the template.
    python_repl = repl.replace("$", "\\")
    return re.compile(pattern, flags).sub(python_repl, src)

def normalize_entities(cur_title):
    """Replace typographic characters and HTML entities with ASCII equivalents.

    Dashes (as characters or as ``&mdash;`` / ``&ndash;`` entities) become
    ``-``, the no-break space becomes a plain space, and guillemets and
    ``&quot;`` become a double quote.

    Args:
        cur_title: The title text to normalize.

    Returns:
        The title with all known characters/entities replaced.
    """
    # The three "&...;" keys must be spelled as entity text: they match
    # entities that are still present as literal text in the title.
    entities = {
        u"\u2014": "-",  # em dash
        u"\u2013": "-",  # en dash
        u"&mdash;": "-",
        u"&ndash;": "-",
        u"\u00A0": " ",  # no-break space
        u"\u00AB": '"',  # left-pointing guillemet
        u"\u00BB": '"',  # right-pointing guillemet
        u"&quot;": '"',
    }
    # str.replace is a no-op when the key is absent, so no membership test
    # is needed first.
    for entity, replacement in entities.items():
        cur_title = cur_title.replace(entity, replacement)
    return cur_title

def norm_title(title):
    """Return *title* after ``normalize_spaces`` and then ``normalize_entities``."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)

def get_title(doc):
    """Return the normalized text of the document's ``<title>`` element.

    Args:
        doc: A parsed document supporting ``find(".//title")``.

    Returns:
        The title passed through ``norm_title``, or ``"[no-title]"`` when the
        element is missing or has no text.
    """
    node = doc.find(".//title")
    text = None if node is None else node.text
    if not text:
        return "[no-title]"
    return norm_title(text)

def add_match(collection, text, orig):
    """Add *text* to *collection* if it looks like part of the title *orig*.

    The normalized text is accepted only when it has at least two words, is
    at least 15 characters long, and (ignoring double quotes) occurs inside
    *orig*.

    Args:
        collection: A set-like object with an ``add`` method; mutated in place.
        text: Candidate text, normalized with ``norm_title`` before checking.
        orig: The already-normalized full page title.
    """
    candidate = norm_title(text)
    # Too short to be a meaningful title fragment.
    if len(candidate.split()) < 2 or len(candidate) < 15:
        return
    unquoted_candidate = candidate.replace('"', "")
    unquoted_title = orig.replace('"', "")
    if unquoted_candidate in unquoted_title:
        collection.add(candidate)

# CSS selectors (ids first, then classes) that shorten_title() queries for
# elements whose text may contain the page's real title.
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]

def shorten_title(doc):
    """Return a shortened version of the document's ``<title>`` text.

    Strategy, in order:

    1. Collect text from ``h1``/``h2``/``h3`` elements and from elements
       matching ``TITLE_CSS_HEURISTICS`` that ``add_match`` accepts as part of
       the title; if any were found, use the longest one.
    2. Otherwise split the title on the first of ``" | "``, ``" - "``,
       ``" :: "``, ``" / "`` whose first or last part has at least four
       words, and use that part.
    3. If no delimiter produced a result, fall back to splitting on ``": "``.

    Args:
        doc: A parsed document supporting ``find``, ``iterfind`` and
            ``cssselect``.

    Returns:
        ``""`` when there is no title text; the normalized original title
        when the shortened result is not between 16 and 149 characters long;
        otherwise the shortened title.
    """
    title_node = doc.find(".//title")
    if title_node is None or not title_node.text:
        return ""

    orig = norm_title(title_node.text)
    title = orig
    candidates = set()

    def consider(element):
        # Offer both the element's own leading text and its full text content.
        if element.text:
            add_match(candidates, element.text, orig)
        full_text = element.text_content()
        if full_text:
            add_match(candidates, full_text, orig)

    for heading_path in (".//h1", ".//h2", ".//h3"):
        for element in doc.iterfind(heading_path):
            consider(element)

    for selector in TITLE_CSS_HEURISTICS:
        for element in doc.cssselect(selector):
            consider(element)

    if candidates:
        # Longest accepted candidate wins.
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in (" | ", " - ", " :: ", " / "):
            if delimiter not in orig:
                continue
            parts = orig.split(delimiter)
            if len(parts[0].split()) >= 4:
                title = parts[0]
                break
            if len(parts[-1].split()) >= 4:
                title = parts[-1]
                break
        else:
            # No delimiter yielded a usable part; try a "prefix: title" shape.
            if ": " in orig:
                last_part = orig.split(": ")[-1]
                if len(last_part.split()) >= 4:
                    title = last_part
                else:
                    title = orig.split(": ", 1)[1]

    # Reject results that are implausibly short or long.
    return title if 15 < len(title) < 150 else orig

# Is this function necessary? The lxml Cleaner is already initialized correctly in cleaners.py.

def get_body(doc):
    """Serialize the document body with scripts, links and styles removed.

    Every ``<script>``, ``<link>`` and ``<style>`` element is dropped from
    *doc* in place, the ``<body>`` element is serialized with
    ``lxml.html.tostring`` and the markup is passed through
    ``clean_attributes``.

    Args:
        doc: A parsed lxml HTML document; it is modified by this call.

    Returns:
        The attribute-cleaned HTML of the body as text.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()

    # Compare against None explicitly: truth-testing an lxml element is
    # deprecated and is False for an element without children, which made
    # ``doc.body or doc`` serialize the whole document for a childless body.
    body = doc.body
    root = body if body is not None else doc

    # tostring() returns encoded bytes by default. Decode before handing the
    # markup to str_: if str_ is the plain ``str`` type on Python 3, calling
    # it on bytes would produce the "b'...'" repr instead of the markup.
    # FIXME: isn't it better to use tounicode?
    raw_html = tostring(root)
    if isinstance(raw_html, bytes):
        raw_html = raw_html.decode("utf-8")
    raw_html = str_(raw_html)

    # The former try/except wrapped only a bare ``return`` and could never
    # fire, so the cleaned markup is returned directly.
    return clean_attributes(raw_html)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

htmls.py

Latest commit

History

htmls.py

File metadata and controls