-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Expand file tree
/
Copy pathmodules.py
More file actions
231 lines (203 loc) · 8.45 KB
/
modules.py
File metadata and controls
231 lines (203 loc) · 8.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
'''MODULE_TYPES: mapping from type-code returned by
imp.find_module to Module subclass'''
import semmle.python.parser.tokenizer
import semmle.python.parser.tsg_parser
import re
import os
from blib2to3.pgen2 import tokenize
import codecs
from semmle.python.passes.labeller import Labeller
from semmle.util import base64digest
from semmle.profiling import timers
__all__ = [ 'PythonSourceModule' ]
class PythonSourceModule(object):
kind = None
def __init__(self, name, path, logger, bytes_source = None):
assert isinstance(path, str), path
self.name = name # May be None
self.path = path
if bytes_source is None:
with timers["load"]:
with open(self.path, 'rb') as src:
bytes_source = src.read()
if BIN_PYTHON.match(bytes_source):
self.kind = "Script"
self._ast = None
self._py_ast = None
self._lines = None
self._line_types = None
self._comments = None
self._tokens = None
self.logger = logger
with timers["decode"]:
self.encoding, self.bytes_source = semmle.python.parser.tokenizer.encoding_from_source(bytes_source)
if self.encoding != 'utf-8':
logger.debug("File '%s' has encoding %s.", path, self.encoding)
try:
self._source = self.bytes_source.decode(self.encoding)
self._illegal_encoding = False
except Exception as ex:
self.logger.warning("%s has encoding '%s'", path, self.encoding)
#Set source to a latin-1 decoding of source string (which cannot fail).
#Attempting to get the AST will raise a syntax error as expected.
self._source = self.bytes_source.decode("latin-1")
self._illegal_encoding = str(ex)
self._source = normalize_line_endings(self._source)
#Strip BOM
if self._source.startswith(u'\ufeff'):
self._source = self._source[1:]
self._secure_hash = base64digest(self._source)
assert isinstance(self._source, str)
@property
def source(self):
return self._source
@property
def lines(self):
if self._lines is None:
def genline():
src = self._source
#Handle non-linux line endings
src = src.replace("\r\n", "\n").replace("\r", "\n")
length = len(src)
start = 0
while True:
end = src.find(u'\n', start)
if end < 0:
if start < length:
yield src[start:]
return
yield src[start:end+1]
start = end+1
self._lines = list(genline())
return self._lines
@property
def tokens(self):
if self._tokens is None:
with timers["tokenize"]:
tokenizer = semmle.python.parser.tokenizer.Tokenizer(self._source)
self._tokens = list(tokenizer.tokens())
return self._tokens
@property
def ast(self):
# The ast will be modified by the labeller, so we cannot share it with the py_ast property.
# However, we expect py_ast to be accessed and used before ast, so we avoid reparsing in that case.
if self._ast is None:
if self._illegal_encoding:
message = self._illegal_encoding
error = SyntaxError(message)
error.filename = self.path
error.lineno, error.offset = offending_byte_position(message, self.bytes_source)
raise error
self._ast = self.py_ast
self._ast.trap_name = self.trap_name
self._py_ast = None
with timers["label"]:
Labeller().apply(self)
return self._ast
@property
def old_py_ast(self):
# The py_ast is the raw ast from the Python parser.
if self._py_ast is None:
with timers["old_py_ast"]:
self.logger.debug("Trying old parser on %s", self.path)
self._py_ast = semmle.python.parser.parse(self.tokens, self.logger)
self.logger.debug("Old parser successful on %s", self.path)
else:
self.logger.debug("Found (during old_py_ast) parse tree for %s in cache", self.path)
return self._py_ast
@property
def py_ast(self):
try:
# If the `CODEQL_PYTHON_DISABLE_OLD_PARSER` flag is present, we do not try to use the
# old parser, and instead jump straight to the exception handler.
if os.environ.get("CODEQL_PYTHON_DISABLE_OLD_PARSER"):
self.logger.debug("Old parser disabled, skipping old parse attempt for %s", self.path)
raise Exception("Skipping old parser")
# Otherwise, we first try to parse the source with the old Python parser.
self._py_ast = self.old_py_ast
return self._py_ast
except Exception as ex:
# If that fails, try to parse the source with the new Python parser (unless it has been
# explicitly disabled).
#
# Like PYTHONUNBUFFERED for Python, we treat any non-empty string as meaning the
# flag is enabled.
# https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED
if os.environ.get("CODEQL_PYTHON_DISABLE_TSG_PARSER"):
if isinstance(ex, SyntaxError):
raise ex
else:
raise SyntaxError("Exception %s while parsing %s" % (ex, self.path))
else:
try:
with timers["tsg_py_ast"]:
if self._py_ast is None:
self.logger.debug("Trying tsg-python on %s", self.path)
self._py_ast = semmle.python.parser.tsg_parser.parse(self.path, self.logger)
self.logger.debug("tsg-python successful on %s", self.path)
else:
self.logger.debug("Found (during py_ast) parse tree for %s in cache", self.path)
return self._py_ast
except SyntaxError as ex:
raise ex
except Exception as ex:
raise SyntaxError("Exception %s in tsg-python while parsing %s" % (ex, self.path))
@property
def trap_name(self):
return type(self).__name__ + ':' + self.path + ":" + self._secure_hash
def get_hash_key(self, token):
return base64digest(self.path + u":" + self._secure_hash + token)
def get_encoding(self):
'Returns encoding of source'
return self.encoding
@property
def comments(self):
''' Returns an iterable of comments in the form:
test, start, end where start and end are line. column
pairs'''
if self._comments is None:
self._lexical()
return self._comments
def close(self):
self.bytes_source = None
self._source = None
self._ast = None
self._line_types = None
self._comments = None
self._lines = None
def _lexical(self):
self._comments = []
for kind, text, start, end in self.tokens:
if kind == tokenize.COMMENT:
self._comments.append((text, start, end))
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
NEWLINE = b'\n'
OFFENDING_BYTE_RE = re.compile(r"decode byte \w+ in position (\d+):")
def offending_byte_position(message, string):
m = OFFENDING_BYTE_RE.search(message)
if m is None:
return (0,0)
badposition = int(m.group(1))
prefix = string[:badposition]
line = prefix.count(NEWLINE) + 1
column = badposition - prefix.rfind(NEWLINE) - 1
return (line, column)
BIN_PYTHON = re.compile(b'#! *(/usr|/bin|/local)*/?(env)? *python')
def is_script(path):
'''Is the file at `path` a script? (does it start with #!... python)'''
try:
with open(path, "rb") as contents:
start = contents.read(100)
return bool(BIN_PYTHON.match(start))
except Exception:
return False
def normalize_line_endings(src):
#Our tokenizer expects single character `\n`, `\r` or `\f` as line endings.
src = src.replace(u'\r\n', u'\n')
#Our parser expects that there are no unterminated lines.
if src and src[-1] != u'\n':
return src + u'\n'
return src