# licensedcode_test_utils.py (forked from aboutcode-org/scancode-toolkit)
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import io
import os
import traceback
from time import time
import attr
import pytest
import saneyaml
from license_expression import Licensing
from commoncode import text
from commoncode.testcase import get_test_file_pairs
from scancode_config import REGEN_TEST_FIXTURES
"""
Data-driven tests using expectations stored in YAML files.
"""
@attr.attrs(slots=True)
class LicenseTest(object):
    """
    A license detection test is used to verify that license detection works
    correctly.

    It consists of two files with the same base name: a .yml file with test data
    and a test file with any other extension that needs to be tested for
    detection.

    The following data are loaded from the .yml file:
     - a list of expected license expressions to detect,
     - an optional language code (defaults to 'en'),
     - optional notes,
     - a boolean flag expected_failure set to True if a test is expected to fail
       for now.

    If the list of license expressions is empty, then this test should not
    detect any license in the test file and explanatory notes are required.
    """
    # path to the YAML data file holding the test expectations
    data_file = attr.attrib(default=None)
    # path to the file to scan for licenses
    test_file = attr.attrib(default=None)
    # part of the test_file path found after the "licensedcode/data/" segment
    test_file_name = attr.attrib(default=None)
    # list of expected license expression strings
    license_expressions = attr.attrib(default=attr.Factory(list))
    notes = attr.attrib(default=None)
    # True if the test is expected to fail
    expected_failure = attr.attrib(default=False)
    language = attr.attrib(default='en')

    # plain class attribute (not an attrs field) shared by all instances and
    # used to parse and render the expected expressions
    licensing = Licensing()

    def __attrs_post_init__(self, *args, **kwargs):
        """
        Compute the test file name and load the expectations from the data
        file, then validate and normalize the expected license expressions.

        Note: ``license_expressions``, ``language``, ``notes`` and
        ``expected_failure`` are always reset from the data file content (or
        to their defaults when there is no data file or no such key); values
        passed to the constructor for these fields are not kept.

        Raise an Exception if the data file cannot be read, if it contains
        unknown keys, if an expression cannot be parsed or if there are neither
        expected expressions nor notes.
        """
        if self.test_file:
            _, _, self.test_file_name = self.test_file.partition(
                os.path.join('licensedcode', 'data') + os.sep)

        data = {}
        if self.data_file:
            try:
                with io.open(self.data_file, encoding='utf-8') as df:
                    data = saneyaml.load(df.read()) or {}
            except Exception as e:
                raise Exception(f'Failed to read: file://{self.data_file}', e) from e

        self.license_expressions = data.pop('license_expressions', [])
        self.language = data.pop('language', 'en')
        self.notes = data.pop('notes', None)

        # True if the test is expected to fail
        self.expected_failure = data.pop('expected_failure', False)

        # every known key was popped above: anything left is unsupported
        if data:
            raise Exception(
                'Unknown data elements: ' + repr(data) +
                ' for: file://' + self.data_file)

        if self.license_expressions:
            # iterate on a copy since items are replaced in place by their
            # rendered form
            for i, exp in enumerate(self.license_expressions[:]):
                try:
                    expression = self.licensing.parse(exp)
                except Exception as e:
                    # only wrap regular errors: a bare except would also turn
                    # KeyboardInterrupt and SystemExit into a plain Exception
                    raise Exception(
                        'Unable to parse License rule expression: '
                        f'{exp!r} for: file://{self.data_file}\n' +
                        traceback.format_exc()
                    ) from e

                if expression is None:
                    raise Exception(
                        'Unable to parse License rule expression: '
                        f'{exp!r} for: file://{self.data_file}'
                    )

                self.license_expressions[i] = expression.render()

        elif not self.notes:
            raise Exception(
                'A license test without expected license_expressions should '
                f'have explanatory notes: for: file://{self.data_file}'
            )

    def to_dict(self):
        """
        Return a mapping of the test data to save in the YAML data file. Empty
        values and the default 'en' language are omitted.
        """
        dct = {}
        if self.license_expressions:
            dct['license_expressions'] = self.license_expressions
        if self.language and self.language != 'en':
            dct['language'] = self.language
        if self.expected_failure:
            dct['expected_failure'] = self.expected_failure
        if self.notes:
            dct['notes'] = self.notes
        return dct

    def dump(self):
        """
        Dump a representation of self to its YAML data file.
        """
        as_yaml = saneyaml.dump(self.to_dict())
        with io.open(self.data_file, 'w', encoding='utf-8') as df:
            df.write(as_yaml)

    def get_content(self):
        """
        Return the content of the test file as a byte string.
        """
        with open(self.test_file, 'rb') as tf:
            return tf.read()

    def get_test_method_name(self, prefix='test_detection_'):
        """
        Return a test method name built from ``prefix`` and the test file name
        and made safe to use as a Python identifier.
        """
        test_name = text.python_safe_name(f'{prefix}{self.test_file_name}')
        if not isinstance(test_name, str):
            test_name = test_name.decode('utf-8')
        return test_name

    @staticmethod
    def load_from(test_dir):
        """
        Return a list of LicenseTest objects loaded from `test_dir`, one for
        each (data file, test file) pair found there.
        """
        return [
            LicenseTest(data_file, test_file)
            for data_file, test_file
            in get_test_file_pairs(test_dir)
        ]
def build_tests(test_dir, clazz, unknown_detection=False, regen=REGEN_TEST_FIXTURES):
    """
    Build one test method for each LicenseTest loaded from ``test_dir`` and
    attach it to the ``clazz`` license test class.

    ``unknown_detection`` and ``regen`` are passed as-is to ``make_test``.
    Raise an Exception if ``clazz`` already has an attribute named like one of
    the generated test methods.
    """
    # TODO: check that we do not have duplicated tests with same data and text
    for lic_test in LicenseTest.load_from(test_dir):
        method_name = lic_test.get_test_method_name()

        # the returned function closes on the lic_test params
        method = make_test(
            lic_test,
            unknown_detection=unknown_detection,
            regen=regen,
        )

        # refuse to silently replace an existing test method
        if hasattr(clazz, method_name):
            raise Exception(
                f'Duplicated test method name: {method_name}: file://{lic_test.test_file}'
            )

        setattr(clazz, method_name, method)
def make_test(license_test, unknown_detection=False, regen=REGEN_TEST_FIXTURES):
    """
    Return a test function closing on the data of a ``license_test``
    LicenseTest object.

    When called, the function matches the test file against the license index
    and compares the detected license expressions to the expected ones. If
    ``regen`` is True, the test data file is rewritten instead of compared. The
    function is wrapped with ``pytest.mark.xfail`` if the test is expected to
    fail.
    """
    test_name = license_test.get_test_method_name()

    from licensedcode import cache
    from licensedcode.tracing import get_texts

    expected_expressions = license_test.license_expressions or []
    test_file = license_test.test_file
    test_data_file = license_test.data_file
    expected_failure = license_test.expected_failure

    def closure_test_function(*args, **kwargs):
        index = cache.get_index()
        matches = index.match(
            location=test_file,
            min_score=0,
            unknown_licenses=unknown_detection,
        ) or []

        detected_expressions = [m.rule.license_expression for m in matches]

        # use detection as expected and dump test back
        if regen:
            if not expected_failure:
                license_test.license_expressions = detected_expressions
            license_test.dump()
            return

        if detected_expressions == expected_expressions:
            return

        # On failure, we compare against more result data to get additional
        # failure details, including the test_file and full match details
        separator = ['======================', '']
        expected = expected_expressions + separator
        results_failure_trace = detected_expressions + separator

        # clickable locations of the test file and of its data file, if any
        locations = [f'file://{test_file}']
        if test_data_file:
            locations.append(f'file://{test_data_file}')

        for match in matches:
            qtext, itext = get_texts(match)
            rule_file = match.rule.rule_file()
            results_failure_trace += [
                '',
                '======= MATCH ====',
                repr(match),
                '======= Matched Query Text for:',
                *locations,
                '',
                qtext,
                '',
                '======= Matched Rule Text for:',
                f'file://{rule_file}',
                '',
                itext,
            ]

        if not matches:
            results_failure_trace += [
                '',
                '======= NO MATCH ====',
                '======= Not Matched Query Text for:',
                *locations,
            ]

        # this assert will always fail and provide a detailed failure trace
        assert '\n'.join(results_failure_trace) == '\n'.join(expected)

    closure_test_function.__name__ = test_name

    if expected_failure:
        closure_test_function = pytest.mark.xfail(closure_test_function)  # NOQA

    return closure_test_function
# A small legalese to use in tests. This must be a sorted mapping of common
# license-specific words aka. legalese as {token: id}.
# The ids are the positions of the tokens in the sorted sequence below.
# See legalese.py on how to re-create and update this mapping.
mini_legalese = {
    token: token_id
    for token_id, token in enumerate((
        'accordance',
        'according',
        'accused',
        'acknowledgement',
        'admission',
        'admitted',
        'agreement',
        'alleged',
        'allowance',
        'alternatively',
        'assessment',
        'assessments',
        'choices',
        'complementary',
        'complications',
        'covered',
        'damages',
        'determines',
        'distribute',
        'distribution',
        'enforcement',
        'exceeding',
        'exceeds',
        'existed',
        'fragments',
        'general',
        'gnu',
        'ignored',
        'liability',
        'license',
        'licensed',
        'literal',
        'means',
        'observed',
        'plaintiff',
        'responded',
        'ultimately',
        'volunteer',
        'warranties',
        'warranty',
    ))
}
def query_run_tokens_with_unknowns(query_run):
    """
    Yield the token ids of a ``query_run`` in sequence, with a None for each
    unknown token counted in the ``unknowns_by_pos`` mapping of its query.

    Iteration stops right after the token at the ``end`` position of the run:
    unknown tokens that follow this last token are not yielded.
    """
    unknown_counts = query_run.query.unknowns_by_pos

    # unknowns counted under position -1 come before any known token: they are
    # yielded only if this is the first query run
    if query_run.start == 0:
        yield from [None] * unknown_counts.get(-1, 0)

    for pos, token_id in query_run.tokens_with_pos():
        yield token_id
        if pos == query_run.end:
            return
        yield from [None] * unknown_counts.get(pos, 0)
def query_tokens_with_unknowns(qry):
    """
    Yield the tokens of a Query ``qry`` in sequence, with a None for each
    unknown token counted in its ``unknowns_by_pos`` mapping.
    """
    unknown_counts = qry.unknowns_by_pos

    # unknowns counted under position -1 come before the first known token
    yield from [None] * unknown_counts.get(-1, 0)

    for pos, token in enumerate(qry.tokens):
        yield token
        # then the unknowns that directly follow the token at this position
        yield from [None] * unknown_counts.get(pos, 0)
def create_rule_from_text_file_and_expression(
    text_file,
    license_expression=None,
    identifier=None,
    **kwargs
):
    """
    Return a new Rule object built from the text of a ``text_file`` and a
    ``license_expression``.

    The rule text is empty if ``text_file`` does not exist and the expression
    is 'mit' if no ``license_expression`` is provided. ``identifier`` and any
    extra ``kwargs`` are passed to ``create_rule_from_text_and_expression``.
    """
    rule_text = ''
    if os.path.exists(text_file):
        from licensedcode.models import get_rule_text
        rule_text = get_rule_text(location=text_file)

    return create_rule_from_text_and_expression(
        text=rule_text,
        license_expression=license_expression or 'mit',
        identifier=identifier,
        **kwargs,
    )
def create_rule_from_text_and_expression(
    text=None,
    license_expression=None,
    identifier=None,
    **kwargs,
):
    """
    Return a new synthetic Rule object built from a ``text``, a
    ``license_expression`` and a rule ``identifier``, after calling its
    ``setup()`` method.

    Falsy arguments are replaced by defaults: an empty text, the 'mit'
    expression and an identifier generated from the current time, the text
    length and the expression. Extra ``kwargs`` are passed to Rule.
    """
    from licensedcode.models import Rule

    if not license_expression:
        license_expression = 'mit'
    if not text:
        text = ''
    if not identifier:
        identifier = f'_tst_{time()}_{len(text)}_{license_expression}'

    new_rule = Rule(
        license_expression=license_expression,
        text=text,
        is_synthetic=True,
        identifier=identifier,
        **kwargs,
    )
    new_rule.setup()
    return new_rule