#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import os

import attr

from parameter_expansion import pe
from pygmars import parse
from pygmars import Token
from pygmars import tree

from packagedcode.bashlex import BashShellLexer

"""
Extract and resolve variable from a Bash or shell script.
"""

# Tracing flags
TRACE = False or os.environ.get('SCANCODE_DEBUG_BASHPARSE', False)
if TRACE:
    # the environment provides a string: convert it to an int level
    TRACE = int(TRACE)
VALIDATE = False or os.environ.get('SCANCODE_DEBUG_BASHPARSE_VALIDATE', False)


# Tracing flags
def logger_debug(*args):
    """
    Do nothing by default; replaced by a real logging function when TRACE is
    enabled.
    """
    pass


if TRACE:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log ``args`` at the DEBUG level as a single space-separated message.
        Strings are logged as-is and any other object is logged as its repr.
        """
        logger.debug(' '.join(a if isinstance(a, str) else repr(a) for a in args))

#
# A simple and minimal pygmars grammar to collect bash/shell variables.
# NOTE(review): several of the string literals below are empty; it looks like
# the token label patterns of each rule may be missing -- confirm against the
# labels emitted by BashShellLexer and the pygmars grammar syntax.
variables_grammar = (
    'SHELL-ARRAY: '
    ''
    '+'
    ''
    ' # An array'
    '\n'

    # This collects variables like: pkgname="gcc compiler" or pkgname=gcc .
    # With the TEXT-WS-LF (newline) (not captured) at the start we ensure we get
    # only things that start on a new line, e.g. that should be top level
    # variable declaration rather than inside a function.
    'SHELL-VARIABLE: '
    '(?:|^)'
    ''
    ''
    ''
    ' # A Shell variable'
    '\n'

    'EMPTY-SHELL-VARIABLE: '
    '(?:|^)'
    ''
    ''
    '(?:?|$)'
    ' # Am empty Shell variable'
)


def collect_shell_variables(location, resolve=False, needed_variables=None):
    """
    Return a tuple of (``shell variables``, ``errors``) from collecting top-
    level variables defined in bash script at ``location``.
    ``shell variables`` is a mapping of {name: value}
    ``errors`` a list of error message strings.

    Optionally ``resolve`` the variables with emulated shell parameter
    expansion. If the set ``needed_variables`` is provided, only return
    variables with a named present in this set and only report errors for
    variables with a name listed in this set.
    """
    with open(location) as inp:
        text = inp.read()

    return collect_shell_variables_from_text_as_dict(
        text=text,
        resolve=resolve,
        needed_variables=needed_variables,
    )

#
# Convenience functions on parse tree labels


def is_shell_variable(node):
    """Return True if the ``node`` label starts with SHELL-VARIABLE."""
    return node.label.startswith('SHELL-VARIABLE')


def is_empty_shell_variable(node):
    """Return True if the ``node`` label starts with EMPTY-SHELL-VARIABLE."""
    return node.label.startswith('EMPTY-SHELL-VARIABLE')


def is_array(node):
    """Return True if the ``node`` label starts with SHELL-ARRAY."""
    return node.label.startswith('SHELL-ARRAY')


def is_array_sep(node):
    """Return True if the ``node`` label starts with ARRAYSEP."""
    return node.label.startswith('ARRAYSEP')


def is_whitespace(node):
    """Return True if the ``node`` label starts with TEXT-WS."""
    return node.label.startswith('TEXT-WS')


def is_comment(node):
    """Return True if the ``node`` label starts with COMMENT."""
    return node.label.startswith('COMMENT')


def is_ignorable(node):
    """Return True if the ``node`` is whitespace or a comment."""
    return is_whitespace(node) or is_comment(node)


def is_decoration(node):
    """Return True if the ``node`` is ignorable or is an array separator."""
    return is_ignorable(node) or is_array_sep(node)


@attr.s
class ShellVariable:
    """
    A shell variable with a ``name`` and a ``value``. The value is a string,
    or a list of strings when ``is_array`` is True.
    """

    name = attr.ib()
    value = attr.ib()
    is_array = attr.ib(default=False, repr=False)

    def is_resolved(self):
        """
        Return True if this variable is fully resolved and does not need
        further shell expansion, e.g., its value does not contain any of the
        ``$``, ``{`` or ``}`` characters. For an array, every item must be
        resolved; an empty array is resolved.
        """
        if self.is_array:
            # check every item: returning from inside a loop would only ever
            # look at the first item
            return all(
                not any(c in item for c in '${}')
                for item in self.value
            )

        return not any(c in self.value for c in '${}')

    @classmethod
    def from_node(cls, node):
        """
        Return a ShellVariable built from a parse tree node or None if the
        node is neither a shell variable nor an empty shell variable.
        Raise an AssertionError if the node does not have the expected number
        of non-ignorable children.
        """

        def get_content(_node, _length):
            # keep only the children that are not whitespace or comments
            _content = [n for n in _node if not is_ignorable(n)]
            assert len(_content) == _length, (
                f'Unknown shell assignment syntax: {_node}'
            )
            return _content

        if is_empty_shell_variable(node):
            # removes space nodes and comment nodes
            content = get_content(node, 2)
            # we should be left with two elements: name=
            name_token, _equal_token = content
            return cls(name=name_token.value, value='', is_array=False)

        if not is_shell_variable(node):
            return None

        content = get_content(node, 3)
        # we should be left with three elements: name = value
        name_token, _equal_token, value_token = content

        if is_array(value_token):
            array = True
            items = [
                i for i in value_token.leaves()
                if not is_decoration(i)
            ]
            value = [dequote(vt) for vt in items]
        else:
            # a plain value string
            array = False
            value = dequote(value_token)

        return cls(name=name_token.value, value=value, is_array=array)

    @classmethod
    def validate(cls, variables, needed_variables=None):
        """
        Return a list of error message if some variables in a ``variables``
        list of ShellVariable are not valid. If the set ``needed_variables``
        is provided, only report errors for variables with a name in this set.
        """

        def reportable(v):
            if needed_variables:
                return v.name in needed_variables
            return True

        seen = {}
        errors = []
        for var in variables:
            # check for duplicate names, but these could be redefinitions
            if reportable(var):
                if var.name in seen:
                    errors.append(
                        f'Duplicate variable name: {var.name!r} value: {var.value!r} '
                        f'existing value: {seen[var.name]!r}'
                    )
                else:
                    seen[var.name] = var.value

        return errors

    @classmethod
    def resolve(cls, variables, needed_variables=None):
        """
        Resolve each variables in a ``variables`` list of ShellVariable.
        Return a tuple of (list with updated variables, list of error
        messages). The variables are updated in place.
        If the set ``needed_variables`` is provided, only report errors for
        variables with a name listed in this set.
        """

        def reportable(v):
            if needed_variables:
                return v.name in needed_variables
            return True

        # mapping of variables that we use for resolution
        # the mappings and values are updated as resolution progresses
        environment = {}
        errors = []

        for var in variables:
            if not environment:
                # nothing was defined before this variable, so there is
                # nothing to expand it with
                if reportable(var) and not var.is_resolved():
                    errors.append(f'Unresolvable first variable: {var}')
                if not var.is_array:
                    # we do not know how to expand an array
                    environment[var.name] = var.value
                continue

            if var.is_resolved():
                if not var.is_array:
                    # we do not know how to expand an array
                    environment[var.name] = var.value
                continue

            try:
                if var.is_array:
                    expanded = []
                    for item in var.value:
                        exp = pe.expand(item, env=environment)
                        # compare against the expanded item, not against the
                        # list accumulating all the expanded items
                        if reportable(var) and ' ' in item and ' ' not in exp:
                            errors.append(f'Expansion munged spaces in value: {item}')
                        expanded.append(exp)
                else:
                    expanded = pe.expand(var.value, env=environment)
                    if reportable(var) and ' ' in var.value and ' ' not in expanded:
                        errors.append(f'Expansion munged spaces in value: {var.value}')

                if TRACE:
                    logger_debug(
                        f'Resolved variable: {var} to: {expanded} '
                        f'with envt: {environment} '
                    )

                var.value = expanded
                if not var.is_array:
                    # we do not know how to expand an array
                    environment[var.name] = expanded

                if reportable(var) and not var.is_resolved():
                    errors.append(
                        f'Partially resolved variable: {var} envt: '
                        f'{environment}'
                    )

            except Exception as e:
                # best effort: a failed expansion is reported as an error
                # message and does not stop resolving the other variables
                if reportable(var):
                    errors.append(f'Failed to expand variable: {var} error: {e}')

        return variables, errors

    def to_dict(self):
        """Return a mapping of {name: value} for this variable."""
        return {self.name: self.value}


def dequote(token):
    """
    Return a token value stripped from quotes based on its token label.
    The value is returned unchanged if the label is not a quoted string label
    or if the value is not wrapped in the matching quotes.
    """
    quote_style_by_token_label = {
        'LITERAL-STRING-DOUBLE': '"',
        'LITERAL-STRING-SINGLE': "'",
    }
    qs = quote_style_by_token_label.get(token.label)
    s = token.value
    if qs and s.startswith(qs) and s.endswith(qs):
        return s[1:-1]
    return s


def collect_shell_variables_from_text_as_dict(text, resolve=False, needed_variables=None):
    """
    Return a tuple of (variables, errors) from collecting top-level variables
    defined in bash script ``text`` string. ``variables`` is a mapping of
    {name: value} and ``errors`` a list of error message strings.

    Optionally ``resolve`` the variables with emulated shell parameter
    expansion. If the set ``needed_variables`` is provided, only return
    variables with a named present in this set and only report errors for
    variables with a name listed in this set.
    """
    vrs, errs = collect_shell_variables_from_text(text, resolve, needed_variables)
    # when a name is defined more than once, the last definition wins
    return {v.name: v.value for v in vrs}, errs


def collect_shell_variables_from_text(text, resolve=False, needed_variables=None):
    """
    Return a tuple of (variables, errors) from collecting top-level variables
    defined in bash script ``text`` string.``variables`` is a list of
    ShellVariable objects and ``errors`` a list of error message strings.

    Optionally ``resolve`` the variables with emulated shell parameter
    expansion. If the set ``needed_variables`` is provided, only return
    variables with a named present in this set and only report errors for
    variables with a name listed in this set.
    """
    parse_tree = parse_shell(text)

    variables = []

    # then walk the parse parse_tree to get variables
    for node in parse_tree:
        if TRACE:
            logger_debug(f'collect_shell_variables: parse_tree node: {node}')

        if not isinstance(node, tree.Tree):
            # only sub-trees are considered: anything else is skipped
            if TRACE:
                logger_debug(f' skipped: {node}')
            continue

        variable = ShellVariable.from_node(node)
        if TRACE:
            logger_debug(f' variable: {variable}')

        if variable:
            variables.append(variable)

    errors = ShellVariable.validate(variables, needed_variables)

    if resolve:
        variables, rerrors = ShellVariable.resolve(variables, needed_variables)
        errors.extend(rerrors)

    if needed_variables:
        # filter only after resolution: a needed variable may reference
        # variables that are not needed themselves
        variables = [v for v in variables if v.name in needed_variables]

    return variables, errors


def get_tokens(text):
    """
    Return a Tokens list from ``text``.
    """
    lexer = BashShellLexer()
    pygtokens = lexer.get_tokens_unprocessed(text)
    return list(Token.from_pygments_tokens(pygtokens))


def parse_shell(text, grammar=variables_grammar, loop=1, trace=TRACE, validate=VALIDATE):
    """
    Return a pygmars parse Tree built from a ``text`` string using the
    ``grammar`` string. ``loop``, ``trace`` and ``validate`` are passed as-is
    to the pygmars Parser.
    """
    tokens = get_tokens(text)

    # then build a parse parse_tree based on tagged tokens
    parser = parse.Parser(
        grammar=grammar,
        trace=trace,
        validate=validate,
        loop=loop,
    )

    if TRACE:
        tokens = list(tokens)
        logger_debug(f'parse_shell: parsing tokens #: {len(tokens)}')

    if TRACE:
        logger_debug('parse_shell: calling parser.parse')

    parse_tree = parser.parse(tokens)

    if TRACE:
        logger_debug(f'parse_shell: got parse_tree: {parse_tree}')

    return parse_tree


if __name__ == '__main__':
    import json
    import sys  # NOQA

    test_file = sys.argv[1]
    results, errs = collect_shell_variables(test_file, resolve=True)
    print(json.dumps(dict(variables=results, errors=errs), indent=2))