forked from buriy/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtmls.py
More file actions
144 lines (114 loc) · 3.9 KB
/
htmls.py
File metadata and controls
144 lines (114 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from lxml.html import tostring
import lxml.html
import re
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_
# Shared lxml HTML parser configured for UTF-8 input; build_doc() re-encodes
# every page to UTF-8 before handing it to this parser.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
def build_doc(page):
    """Parse *page* into an lxml HTML document.

    *page* may be text (an instance of ``str_``) or an encoded byte string.
    Byte input is decoded with the encoding reported by ``get_encoding()``,
    falling back to "utf-8"; undecodable bytes are replaced.

    Returns a ``(doc, encoding)`` tuple. ``encoding`` is ``None`` when *page*
    was already text, otherwise the encoding used to decode it.
    """
    encoding = None
    text = page
    if not isinstance(page, str_):
        encoding = get_encoding(page) or "utf-8"
        text = page.decode(encoding, "replace")

    # XXX: the decode/encode round trip is done even for utf-8 pages, because
    # it is what strips out bad characters before parsing.
    utf8_bytes = text.encode("utf-8", "replace")
    return lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser), encoding
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    Args:
        src: The string to search.
        pattern: Regular expression source, compiled with ``re.compile``.
        flags: ``re`` module flags passed to ``re.compile`` (0 for none).
        repl: Replacement template using JavaScript ``$1``-style group
            references; every ``$`` is rewritten to a backslash so the
            template becomes a Python one.

    Returns:
        *src* with every non-overlapping match of *pattern* replaced.
    """
    python_repl = repl.replace("$", "\\")
    # Pattern.sub takes (repl, string); the subject string goes second.
    return re.compile(pattern, flags).sub(python_repl, src)
def normalize_entities(cur_title):
    """Replace typographic characters and entity references in a title.

    Em and en dashes become "-", non-breaking spaces become " ", and
    guillemets and quote entities become '"'. Both the literal characters
    and the ``&mdash;`` / ``&ndash;`` / ``&quot;`` entity references are
    handled.

    Args:
        cur_title: The title text to normalize.

    Returns:
        The title with every listed sequence replaced.
    """
    entities = {
        u"\u2014": "-",
        u"\u2013": "-",
        # Entity references must stay spelled out as text: written as the
        # decoded characters they would duplicate the keys above, and a
        # decoded &quot; would not even parse.
        u"&mdash;": "-",
        u"&ndash;": "-",
        u"\u00A0": " ",
        u"\u00AB": '"',
        u"\u00BB": '"',
        u"&quot;": '"',
    }
    for needle, replacement in entities.items():
        # str.replace leaves the string unchanged when the needle is absent,
        # so no separate membership test is needed.
        cur_title = cur_title.replace(needle, replacement)
    return cur_title
def norm_title(title):
    """Return *title* passed through normalize_spaces() and then normalize_entities()."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)
def get_title(doc):
    """Return the normalized text of the document's <title> element.

    Returns the placeholder "[no-title]" when there is no <title> element
    or its text is missing or empty.
    """
    node = doc.find(".//title")
    text = None if node is None else node.text
    if not text:
        return "[no-title]"
    return norm_title(text)
def add_match(collection, text, orig):
    """Add normalized *text* to *collection* if it is a plausible title.

    The normalized text qualifies when it has at least two words, is at
    least 15 characters long, and (ignoring double quotes) occurs inside
    *orig*.
    """
    normalized = norm_title(text)
    if len(normalized.split()) < 2 or len(normalized) < 15:
        return
    if normalized.replace('"', "") in orig.replace('"', ""):
        collection.add(normalized)
# CSS selectors (ids and classes) that shorten_title() passes to
# doc.cssselect() when looking for elements whose text may match the
# page <title>.
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]
def shorten_title(doc):
    """Return a shortened version of the document's <title>.

    Heading elements (h1-h3) and elements matched by TITLE_CSS_HEURISTICS
    are collected as candidates via add_match(); the longest candidate wins.
    With no candidates, the title is split on common delimiters and the
    first or last part is used if it has at least four words; failing that,
    the text after a ": " is used.

    Returns "" when there is no usable <title>, and the full normalized
    title when the shortened result is not between 16 and 149 characters.
    """
    title_el = doc.find(".//title")
    if title_el is None or not title_el.text:
        return ""

    orig = norm_title(title_el.text)
    title = orig
    candidates = set()

    def collect(elements):
        # Offer both the element's own text and its full text content.
        for element in elements:
            if element.text:
                add_match(candidates, element.text, orig)
            content = element.text_content()
            if content:
                add_match(candidates, content, orig)

    for path in (".//h1", ".//h2", ".//h3"):
        collect(doc.findall(path))
    for selector in TITLE_CSS_HEURISTICS:
        collect(doc.cssselect(selector))

    if candidates:
        ranked = sorted(candidates, key=len)
        title = ranked[-1]
    else:
        # `title` still equals `orig` until a branch below picks a part.
        for delimiter in (" | ", " - ", " :: ", " / "):
            if delimiter not in orig:
                continue
            parts = orig.split(delimiter)
            first, last = parts[0], parts[-1]
            if len(first.split()) >= 4:
                title = first
                break
            if len(last.split()) >= 4:
                title = last
                break
        else:
            # No delimiter produced a usable part: fall back to ": ".
            if ": " in orig:
                tail = orig.split(": ")[-1]
                if len(tail.split()) >= 4:
                    title = tail
                else:
                    title = orig.split(": ", 1)[1]

    return title if 15 < len(title) < 150 else orig
# Is this necessary? The lxml Cleaner is already initialized correctly in cleaners.py.
def get_body(doc):
    """Serialize the body of *doc* to HTML with attributes cleaned.

    Side effect: every <script>, <link> and <style> element is dropped from
    *doc* itself before serialization.

    Args:
        doc: An lxml HTML document or element.

    Returns:
        The serialized markup after passing through clean_attributes().
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()

    # lxml elements are falsy when they have no child elements, so
    # ``doc.body or doc`` would serialize the whole document for a <body>
    # that contains only text. Compare against None explicitly instead.
    body = doc.body
    root = doc if body is None else body

    # FIXME: isn't better to use tounicode?
    # NOTE(review): tostring() returns bytes by default; this relies on str_
    # turning that into text correctly -- confirm against compat.str_.
    raw_html = str_(tostring(root))

    # The previous try/except wrapped only a bare ``return``, which cannot
    # raise, so its raw_html fallback was unreachable and has been removed.
    return clean_attributes(raw_html)