forked from buriy/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding.py
More file actions
61 lines (50 loc) · 1.97 KB
/
encoding.py
File metadata and controls
61 lines (50 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
import chardet
import logging
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# <meta ... charset=...>: captures the charset value (up to the next quote
# or '>'), matched case-insensitively.
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
# <meta ... content="charset=..."> pragma form: captures the value only when
# `charset=` (optionally preceded by ';') directly follows the opening of the
# content attribute. Matched case-insensitively.
RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
# <?xml ... encoding="..."?> prolog: anchored at the very start of the input
# and, unlike the two patterns above, case-sensitive.
RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
# Substitution table consulted by fix_charset(): keys are lower-case encoding
# names as declared in a document or detected, values are the encoding names
# returned in their place.
CHARSETS = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'maccyrillic': 'cp1251',
'win1251': 'cp1251',
'win-1251': 'cp1251',
'windows-1251': 'cp1251',
}
def fix_charset(encoding):
    """Return the encoding name that should be used in place of *encoding*.

    The name is lower-cased and looked up in ``CHARSETS``. When an entry
    exists its replacement is returned; otherwise the lower-cased name is
    returned as is. The table exists because a declared or detected charset
    may be a subset of a larger one (originally a problem seen on Chinese
    websites).
    """
    name = encoding.lower()
    try:
        return CHARSETS[name]
    except KeyError:
        # No override registered for this name.
        return name
def get_encoding(page):
    """Guess the character encoding of a raw, undecoded HTML/XML document.

    Encodings declared inside the document are tried first, in the order
    the module-level patterns find them (``RE_CHARSET``, then ``RE_PRAGMA``,
    then ``RE_XML``). The first one that decodes the whole page without
    error is returned. If none works, markup is stripped and ``chardet``
    guesses from the remaining text.

    Args:
        page: The undecoded document as a byte string. It must support
            ``.decode(encoding)`` and be searchable by the module-level
            ``RE_*`` patterns.

    Returns:
        The encoding name to use, passed through ``fix_charset``.
        ``'utf-8'`` is returned when there is too little text to guess
        from, or when ``chardet`` cannot determine an encoding.
    """
    declared_encodings = (RE_CHARSET.findall(page) +
                          RE_PRAGMA.findall(page) +
                          RE_XML.findall(page))
    log.debug("Document has the following encodings: %s", declared_encodings)

    # Try declared encodings, if any
    for declared_encoding in declared_encodings:
        encoding = fix_charset(declared_encoding)
        try:
            page.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the page is not valid in that encoding.
            # LookupError: the document names a codec Python does not know;
            # treat it like any other declaration that didn't work instead
            # of crashing, and move on to the next candidate.
            log.info('Encoding "%s", specified in the document as "%s" '
                     'didn\'t work', encoding, declared_encoding)
        else:
            log.info('Using encoding "%s"', encoding)
            return encoding

    # Fallback to chardet if declared encodings fail.
    # Replace tags (and trailing whitespace) with a space so that only the
    # document text is handed to the detector.
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        log.debug("Can't guess encoding because text is too short")
        return enc

    res = chardet.detect(text)
    guessed = res['encoding']
    if not guessed:
        # chardet reports None when it cannot decide; keep the default
        # rather than passing None to fix_charset().
        log.debug("chardet could not determine an encoding")
        return enc

    enc = fix_charset(guessed)
    log.info('Trying encoding "%s" guessed '
             'with confidence %.2f', enc, res['confidence'])
    return enc