forked from aboutcode-org/scancode-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbashparse.py
More file actions
413 lines (319 loc) · 12.5 KB
/
bashparse.py
File metadata and controls
413 lines (319 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import os
import attr
from parameter_expansion import pe
from pygmars import parse
from pygmars import Token
from pygmars import tree
from packagedcode.bashlex import BashShellLexer
"""
Extract and resolve variables from a Bash or shell script.
"""
# Tracing flags
# SCANCODE_DEBUG_BASHPARSE enables tracing: when set to a non-empty value it is
# converted to an int; otherwise TRACE stays as found (False when unset).
TRACE = os.environ.get('SCANCODE_DEBUG_BASHPARSE', False)
if TRACE:
    TRACE = int(TRACE)

# SCANCODE_DEBUG_BASHPARSE_VALIDATE is kept as the raw environment string when
# set, and is False when unset.
VALIDATE = os.environ.get('SCANCODE_DEBUG_BASHPARSE_VALIDATE', False)
# Tracing logger: a no-op unless TRACE is enabled
def logger_debug(*args):
    """
    Do nothing. This default is replaced by a real logging function below
    when TRACE is enabled.
    """
    pass


if TRACE:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log all ``args`` as a single space-joined DEBUG message. Strings are
        logged as-is and any other object is logged using its repr().
        """
        # A conditional expression is used rather than the ``x and a or b``
        # idiom, which would wrongly fall through to repr() for an empty
        # string and log it as ''.
        logger.debug(' '.join(a if isinstance(a, str) else repr(a) for a in args))
#
# A simple and minimal pygmars grammar to collect bash/shell variables.
# The grammar is a single string built from adjacent string literals. It
# contains three rules separated by '\n', each of the form
# 'LABEL: <pattern over token labels> # description'.
# NOTE(review): the pattern syntax is presumably the regex-like pygmars grammar
# syntax over token labels -- confirm against the pygmars documentation.
variables_grammar = (
    # An array is an array start separator, then one or more whitespace,
    # comment, quoted string or plain text tokens, then an array end separator.
    'SHELL-ARRAY: '
    '<ARRAYSEP-START>'
    '<TEXT-WS| TEXT-WS-LF | COMMENT |LITERAL-STRING-DOUBLE | LITERAL-STRING-SINGLE | TEXT>+'
    '<ARRAYSEP-END>'
    ' # An array'
    '\n'
    # This collects variables like: pkgname="gcc compiler" or pkgname=gcc .
    # With the TEXT-WS-LF (newline) (not captured) at the start we ensure we get
    # only things that start on a new line, e.g. that should be top level
    # variable declaration rather than inside a function.
    'SHELL-VARIABLE: '
    '(?:<TEXT-WS-LF>|^)'
    '<NAME-VARIABLE>'
    '<OPERATOR-EQUAL>'
    '<LITERAL-STRING-DOUBLE | LITERAL-STRING-SINGLE | TEXT | SHELL-ARRAY>'
    ' # A Shell variable'
    '\n'
    # This collects assignments with nothing after the equal sign, like:
    # pkgname= followed by an optional newline or the end of the input.
    'EMPTY-SHELL-VARIABLE: '
    '(?:<TEXT-WS-LF>|^)'
    '<NAME-VARIABLE>'
    '<OPERATOR-EQUAL>'
    '(?:<TEXT-WS-LF>?|$)'
    ' # Am empty Shell variable'
)
def collect_shell_variables(location, resolve=False, needed_variables=None):
    """
    Read the bash script file at ``location`` and return a tuple of
    (``shell variables``, ``errors``) for its top-level variables.

    ``shell variables`` is a mapping of {name: value} and ``errors`` is a list
    of error message strings.

    When ``resolve`` is True, variables are resolved with an emulated shell
    parameter expansion. When a ``needed_variables`` set is provided, only
    variables with a name present in this set are returned and errors are
    only reported for these variables.
    """
    with open(location) as script:
        content = script.read()

    collected = collect_shell_variables_from_text_as_dict(
        text=content,
        resolve=resolve,
        needed_variables=needed_variables,
    )
    return collected
#
# Convenience functions on parse tree labels
def is_shell_variable(node):
    """
    Return True if the ``node`` label starts with 'SHELL-VARIABLE'.
    """
    label = node.label
    return label.startswith('SHELL-VARIABLE')
def is_empty_shell_variable(node):
    """
    Return True if the ``node`` label starts with 'EMPTY-SHELL-VARIABLE'.
    """
    label = node.label
    return label.startswith('EMPTY-SHELL-VARIABLE')
def is_array(node):
    """
    Return True if the ``node`` label starts with 'SHELL-ARRAY'.
    """
    label = node.label
    return label.startswith('SHELL-ARRAY')
def is_array_sep(node):
    """
    Return True if the ``node`` label starts with 'ARRAYSEP'.
    """
    label = node.label
    return label.startswith('ARRAYSEP')
def is_whitespace(node):
    """
    Return True if the ``node`` label starts with 'TEXT-WS'.
    """
    label = node.label
    return label.startswith('TEXT-WS')
def is_comment(node):
    """
    Return True if the ``node`` label starts with 'COMMENT'.
    """
    label = node.label
    return label.startswith('COMMENT')
def is_ignorable(node):
    """
    Return True if ``node`` is a whitespace or a comment node.
    """
    if is_whitespace(node):
        return True
    return is_comment(node)
def is_decoration(node):
    """
    Return True if ``node`` is ignorable (whitespace or comment) or is an
    array separator node.
    """
    if is_ignorable(node):
        return True
    return is_array_sep(node)
@attr.s
class ShellVariable:
    """
    A shell variable assignment collected from a parse tree, with a ``name``
    and a ``value``. The ``value`` is a string, or a list of strings when
    ``is_array`` is True.
    """
    name = attr.ib()
    value = attr.ib()
    is_array = attr.ib(default=False, repr=False)

    def is_resolved(self):
        """
        Return True if this variable is fully resolved and does not need further
        shell expansion.

        A value is considered resolved when it contains none of the ``$``,
        ``{`` or ``}`` characters. For an array, every item must be resolved;
        an empty array is resolved.
        """
        # Check every array item, and not only the first one.
        values = self.value if self.is_array else [self.value]
        return not any(c in item for item in values for c in '${}')

    @classmethod
    def from_node(cls, node):
        """
        Return a ShellVariable built from a parse tree node or None.

        Return None when ``node`` is neither a shell variable nor an empty
        shell variable node. Raise an AssertionError when the node does not
        have the expected number of children once whitespace and comments
        are removed.
        """

        def get_content(_node, _length):
            # removes space nodes and comment nodes
            _content = [n for n in _node if not is_ignorable(n)]
            assert len(_content) == _length, (
                f'Unknown shell assignment syntax: {_node}'
            )
            return _content

        if is_empty_shell_variable(node):
            content = get_content(node, 2)
            # we should be left with two elements: name=
            name_token, _equal_token = content
            return cls(name=name_token.value, value='', is_array=False)

        if not is_shell_variable(node):
            return

        content = get_content(node, 3)
        # we should be left with three elements: name = value
        name_token, _equal_token, value_token = content

        if is_array(value_token):
            array = True
            # keep only the actual items: drop whitespace, comments and the
            # array separators
            items = [
                i for i in value_token.leaves()
                if not is_decoration(i)
            ]
            value = [dequote(vt) for vt in items]
        else:
            # a plain value string
            array = False
            value = dequote(value_token)

        return cls(name=name_token.value, value=value, is_array=array)

    @classmethod
    def validate(cls, variables, needed_variables=None):
        """
        Return a list of error messages if some variables in a ``variables``
        list of ShellVariable are not valid.

        The only check is for duplicated variable names. If the
        ``needed_variables`` set is provided, only variables with a name
        listed in this set are checked.
        """

        def reportable(v):
            if needed_variables:
                return v.name in needed_variables
            return True

        # mapping of {name: first value seen for that name}
        seen = dict()
        errors = []
        for var in variables:
            # check for duplicate names, but these could be redefinitions
            if reportable(var):
                if var.name in seen:
                    errors.append(
                        f'Duplicate variable name: {var.name!r} value: {var.value!r} '
                        f'existing value: {seen[var.name]!r}'
                    )
                else:
                    seen[var.name] = var.value

        return errors

    @classmethod
    def resolve(cls, variables, needed_variables=None):
        """
        Resolve each variable in a ``variables`` list of ShellVariable. Return
        a tuple of (list with updated variables, list of error messages).

        The variables are updated in place. If the ``needed_variables`` set is
        provided, only report errors for variables with a name listed in this
        set.
        """

        def reportable(v):
            if needed_variables:
                return v.name in needed_variables
            return True

        # mapping of variables that we use for resolution
        # the mappings and values are updated as resolution progresses
        environment = {}
        errors = []
        for var in variables:
            if not environment:
                # there is nothing yet to resolve this variable against
                if reportable(var) and not var.is_resolved():
                    errors.append(f'Unresolvable first variable: {var}')
                if not var.is_array:
                    # we do not know how to expand an array
                    environment[var.name] = var.value
                continue

            if var.is_resolved():
                if not var.is_array:
                    # we do not know how to expand an array
                    environment[var.name] = var.value
                continue

            try:
                if var.is_array:
                    expanded = []
                    for item in var.value:
                        exp = pe.expand(item, env=environment)
                        # compare against this item expansion, and not the
                        # list of the expanded items collected so far
                        if reportable(var) and ' ' in item and ' ' not in exp:
                            errors.append(f'Expansion munged spaces in value: {item}')
                        expanded.append(exp)
                else:
                    expanded = pe.expand(var.value, env=environment)
                    if reportable(var) and ' ' in var.value and ' ' not in expanded:
                        errors.append(f'Expansion munged spaces in value: {var.value}')

                if TRACE:
                    logger_debug(
                        f'Resolved variable: {var} to: {expanded} '
                        f'with envt: {environment} '
                    )

                var.value = expanded
                if not var.is_array:
                    # we do not know how to expand an array
                    environment[var.name] = expanded

                if reportable(var) and not var.is_resolved():
                    errors.append(
                        f'Partially resolved variable: {var} envt: '
                        f'{environment}'
                    )

            except Exception as e:
                # a failed expansion is reported as an error and does not stop
                # the resolution of the remaining variables
                if reportable(var):
                    errors.append(f'Failed to expand variable: {var} error: {e}')

        return variables, errors

    def to_dict(self):
        """
        Return a mapping of {name: value} for this variable.
        """
        return {self.name: self.value}
def dequote(token):
    """
    Return the ``token`` value with its enclosing quotes removed when the
    token label is a double- or single-quoted literal string and the value
    both starts and ends with the matching quote character. Otherwise, return
    the value unchanged.
    """
    label = token.label
    text = token.value

    if label == 'LITERAL-STRING-DOUBLE':
        quote = '"'
    elif label == 'LITERAL-STRING-SINGLE':
        quote = "'"
    else:
        # not a quoted string token: nothing to strip
        return text

    if text.startswith(quote) and text.endswith(quote):
        return text[1:-1]
    return text
def collect_shell_variables_from_text_as_dict(text, resolve=False, needed_variables=None):
    """
    Return a tuple of (variables, errors) from collecting the top-level
    variables defined in a bash script ``text`` string. ``variables`` is a
    mapping of {name: value} and ``errors`` is a list of error message
    strings.

    When ``resolve`` is True, variables are resolved with an emulated shell
    parameter expansion. When a ``needed_variables`` set is provided, only
    variables with a name present in this set are returned and errors are
    only reported for these variables.
    """
    variables, errors = collect_shell_variables_from_text(text, resolve, needed_variables)

    values_by_name = {}
    for variable in variables:
        # a later variable replaces an earlier variable with the same name
        values_by_name[variable.name] = variable.value

    return values_by_name, errors
def collect_shell_variables_from_text(text, resolve=False, needed_variables=None):
    """
    Return a tuple of (variables, errors) from collecting the top-level
    variables defined in a bash script ``text`` string. ``variables`` is a
    list of ShellVariable objects and ``errors`` is a list of error message
    strings.

    When ``resolve`` is True, variables are resolved with an emulated shell
    parameter expansion. When a ``needed_variables`` set is provided, only
    variables with a name present in this set are returned and errors are
    only reported for these variables.
    """
    variables = []

    # walk the top-level nodes of the parse tree to get variables
    for node in parse_shell(text):
        if TRACE:
            logger_debug(f'collect_shell_variables: parse_tree node: {node}')

        if isinstance(node, tree.Tree):
            variable = ShellVariable.from_node(node)
            if TRACE:
                logger_debug(f' variable: {variable}')
            if variable:
                variables.append(variable)
        elif TRACE:
            # only Tree nodes can be a variable
            logger_debug(f' skipped: {node}')

    errors = ShellVariable.validate(variables, needed_variables)

    if resolve:
        variables, resolution_errors = ShellVariable.resolve(variables, needed_variables)
        errors.extend(resolution_errors)

    if needed_variables:
        variables = [var for var in variables if var.name in needed_variables]

    return variables, errors
def get_tokens(text):
    """
    Return a list of Token built from lexing the ``text`` string with the
    BashShellLexer.
    """
    raw_tokens = BashShellLexer().get_tokens_unprocessed(text)
    tokens = Token.from_pygments_tokens(raw_tokens)
    return list(tokens)
def parse_shell(text, grammar=variables_grammar, loop=1, trace=TRACE, validate=VALIDATE):
    """
    Return a pygmars parse Tree built from lexing then parsing a ``text``
    string, using the ``grammar`` string. The ``loop``, ``trace`` and
    ``validate`` arguments are passed as-is to the pygmars Parser.
    """
    tokens = get_tokens(text)

    # build a parse tree based on the tagged tokens
    parser = parse.Parser(
        grammar=grammar,
        trace=trace,
        validate=validate,
        loop=loop,
    )

    if TRACE:
        logger_debug(f'parse_shell: parsing tokens #: {len(tokens)}')
        logger_debug('parse_shell: calling parser.parse')

    parsed = parser.parse(tokens)

    if TRACE:
        logger_debug(f'parse_shell: got parse_tree: {parsed}')

    return parsed
if __name__ == '__main__':
    # Command line usage: collect and resolve the variables of the shell
    # script at the path provided as first argument and print these as JSON.
    import json
    import sys  # NOQA

    script_path = sys.argv[1]
    variables, errors = collect_shell_variables(script_path, resolve=True)
    report = dict(variables=variables, errors=errors)
    print(json.dumps(report, indent=2))