forked from aboutcode-org/scancode-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoutput_csv.py
More file actions
360 lines (282 loc) · 11 KB
/
output_csv.py
File metadata and controls
360 lines (282 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import attr
import csv
import logging
import os
import warnings
import saneyaml
from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import OUTPUT_GROUP
from plugincode.output import output_impl
from plugincode.output import OutputPlugin
from formattedcode import FileOptionType
# Tracing flag: set the SCANCODE_DEBUG_OUTPUT_CSV environment variable to any
# non-empty value to enable debug logging to stdout.
TRACE = os.environ.get('SCANCODE_DEBUG_OUTPUT_CSV', False)

logger = logging.getLogger(__name__)


def logger_debug(*args):
    # no-op placeholder; replaced by a real logger below when tracing is on
    pass


if TRACE:
    import sys
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        # Note: use a conditional expression rather than the `and/or` idiom:
        # `isinstance(a, str) and a or repr(a)` would misformat falsy strings
        # (an empty string would be logged as "''").
        return logger.debug(' '.join(a if isinstance(a, str) else repr(a) for a in args))


DEPRECATED_MSG = (
    'The --csv option is deprecated and will be replaced by new CSV and '
    'tabular output formats in the next ScanCode release. '
    'Visit https://github.com/nexB/scancode-toolkit/issues/3043 to provide inputs and feedback.'
)
@output_impl
class CsvOutput(OutputPlugin):
    """
    Deprecated output plugin writing scan results as CSV to a file
    provided with the --csv command line option.
    """

    options = [
        PluggableCommandLineOption(
            ('--csv',),
            type=FileOptionType(mode='w', encoding='utf-8', lazy=True),
            metavar='FILE',
            help='[DEPRECATED] Write scan output as CSV to FILE. ' + DEPRECATED_MSG,
            help_group=OUTPUT_GROUP,
            sort_order=30,
        ),
    ]

    def is_enabled(self, csv, **kwargs):
        # enabled only when a --csv FILE was provided
        return csv

    def process_codebase(self, codebase, csv, **kwargs):
        # surface the deprecation both as a Python warning and on stderr
        warnings.warn(DEPRECATED_MSG, DeprecationWarning, stacklevel=1)
        import click
        click.secho('[DEPRECATION WARNING] ' + DEPRECATED_MSG, err=True)
        write_csv(self.get_files(codebase, **kwargs), csv)
def write_csv(results, output_file):
    """
    Write `results` (an iterable of scanned-file mappings) as CSV rows to the
    `output_file` file-like object, one row per flattened scan entry.
    """
    # FIXME: this is reading all in memory
    results = list(results)

    # ordered mapping of column group name -> list of column names; populated
    # as a side effect by flatten_scan (dicts preserve insertion order)
    headers = {
        'info': [],
        'license': [],
        'copyright': [],
        'email': [],
        'url': [],
        'package': [],
    }

    # note: FIXME: headers are collected as a side effect and this is not great
    rows = list(flatten_scan(results, headers))

    # flatten the grouped column names, keeping the group order above
    ordered_headers = []
    for key_group in headers.values():
        ordered_headers.extend(key_group)

    w = csv.DictWriter(output_file, fieldnames=ordered_headers)
    w.writeheader()
    w.writerows(rows)
def flatten_scan(scan, headers):
    """
    Yield ordered dictionaries of key/values flattening the sequence
    data in a single line-separated value and keying always by path,
    given a ScanCode `scan` results list. Update the `headers` mapping
    sequences with seen keys as a side effect.
    """
    seen = set()

    def collect_keys(mapping, key_group):
        """Update the headers with new keys."""
        keys = mapping.keys()
        headers[key_group].extend(k for k in keys if k not in seen)
        seen.update(keys)

    for scanned_file in scan:
        path = scanned_file.pop('path')

        # remove any leading slash from the path
        path = path.lstrip('/')

        # use a trailing slash for directories
        if scanned_file.get('type') == 'directory' and not path.endswith('/'):
            path += '/'

        errors = scanned_file.pop('scan_errors', [])

        # FIXME: info are NOT lists: lists are the actual scans
        file_info = dict(path=path)
        file_info.update(
            (
                (k, v) for k, v in scanned_file.items()
                if not isinstance(v, (list, dict))
            )
        )
        # Scan errors are joined in a single multi-line value
        file_info['scan_errors'] = '\n'.join(errors)
        collect_keys(file_info, 'info')
        yield file_info

        for detection in scanned_file.get('license_detections', []):
            license_expression = detection["license_expression"]
            detection_log = detection.get("detection_log", []) or []
            detection_log = '\n'.join(detection_log)
            license_matches = detection["matches"]

            for match in license_matches:
                lic = dict(path=path)
                lic["license_expression"] = license_expression
                lic["detection_log"] = detection_log
                for k, val in match.items():
                    # do not include matched text for now.
                    if k == 'matched_text':
                        continue

                    if k == 'licenses':
                        license_keys = []
                        for license_item in val:
                            license_keys.append(license_item["key"])
                        k = 'license_match__' + k
                        lic[k] = '\n'.join(license_keys)
                        continue

                    if k in ('score', 'match_coverage', 'rule_relevance'):
                        val = with_two_decimals(val)

                    # lines are present in multiple scans: keep their column name as
                    # not scan-specific. Prefix other columns with license_match__
                    if k not in ('start_line', 'end_line',):
                        k = 'license_match__' + k

                    lic[k] = val

                collect_keys(lic, 'license')
                yield lic

        for copyr in scanned_file.get('copyrights', []):
            inf = dict(path=path)
            inf['copyright'] = copyr['copyright']
            inf['start_line'] = copyr['start_line']
            # bug fix: was copyr['start_line'], yielding a wrong end line
            inf['end_line'] = copyr['end_line']
            collect_keys(inf, 'copyright')
            yield inf

        for copyr in scanned_file.get('holders', []):
            inf = dict(path=path)
            inf['holder'] = copyr['holder']
            inf['start_line'] = copyr['start_line']
            # bug fix: was copyr['start_line'], yielding a wrong end line
            inf['end_line'] = copyr['end_line']
            collect_keys(inf, 'copyright')
            yield inf

        for copyr in scanned_file.get('authors', []):
            inf = dict(path=path)
            inf['author'] = copyr['author']
            inf['start_line'] = copyr['start_line']
            # bug fix: was copyr['start_line'], yielding a wrong end line
            inf['end_line'] = copyr['end_line']
            collect_keys(inf, 'copyright')
            yield inf

        for email in scanned_file.get('emails', []):
            email_info = dict(path=path)
            email_info.update(email)
            collect_keys(email_info, 'email')
            yield email_info

        for url in scanned_file.get('urls', []):
            url_info = dict(path=path)
            url_info.update(url)
            collect_keys(url_info, 'url')
            yield url_info

        for package in scanned_file.get('package_data', []):
            flat = flatten_package(package, path)
            collect_keys(flat, 'package')
            yield flat
def with_two_decimals(val):
    """
    Return a normalized score string with two decimal values
    """
    if isinstance(val, (float, int)):
        # numbers are rendered with exactly two decimal places
        return '{:.2f}'.format(val)
    if isinstance(val, str):
        return val
    # anything else is stringified as-is
    return str(val)
def pretty(data):
    """
    Return a unicode text pretty representation of data (as YAML or else) if
    data is a sequence or mapping or the data as-is otherwise
    """
    if not data:
        # empty or falsy values are normalized to None
        return None

    if isinstance(data, (list, tuple)) and len(data) == 1 and isinstance(data[0], str):
        # a one-string sequence collapses to that string, stripped
        return data[0].strip()

    if isinstance(data, (list, tuple, dict)):
        dumped = saneyaml.dump(data, indent=2, encoding='utf-8')
        return dumped.decode('utf-8').strip()

    return data
def get_package_columns(_columns=set()):
    """
    Return (and cache in the `_columns` mutable default) a set of package
    column names included in the CSV output.
    Some columns are excluded for now such as lists of mappings: these do not
    serialize well to CSV
    """
    if _columns:
        return _columns

    from packagedcode.models import PackageData

    package_data_fields = [field.name for field in attr.fields(PackageData)]

    # exclude some columns for now that contain list of items
    excluded_columns = {
        # list of strings
        'keywords',
        # list of dicts
        'parties',
        'dependencies',
        'source_packages',
    }

    # some extra columns for components
    extra_columns = [
        'purl',
        'components',
        'owner_name',
        'reference_notes',
        'description',
        'notice_filename',
        'notice_url',
    ]

    fields = package_data_fields + extra_columns
    # bug fix: update the default set in place. Rebinding with
    # `_columns = set(...)` left the cached default empty forever, so the
    # cache never worked and columns were recomputed on every call.
    _columns.update(f for f in fields if f not in excluded_columns)
    return _columns
def flatten_package(_package, path, prefix='package__'):
    """
    Return a single flat mapping for the `_package` mapping keyed by `path`,
    with every included package key renamed with a `prefix`.
    """
    # known package columns; anything else is dropped
    known_columns = get_package_columns()

    row = dict(path=path)
    for key, value in _package.items():
        if key not in known_columns:
            continue

        # prefix columns with "package__"
        column = prefix + key

        if key == 'version':
            version = value or ''
            if version and not version.lower().startswith('v'):
                # prefix versions with a v to avoid spreadsheet tools to mistake
                # a version for a number or date when reading CSVs (common with
                # Excel and LibreOffice).
                version = 'v ' + version
            row[column] = version
            continue

        # these may come from a component matched
        if key == 'components' and value and isinstance(value, list):
            for component in value:
                for ckey, cval in component.items():
                    if ckey not in known_columns:
                        continue
                    ccolumn = column + '__' + ckey
                    if cval is None:
                        row[ccolumn] = ''
                        continue
                    if isinstance(cval, list):
                        cval = '\n'.join(cval)
                    if not isinstance(cval, str):
                        cval = repr(cval)
                    # accumulate values from multiple components in one cell
                    previous = row.get(ccolumn) or []
                    if not isinstance(previous, list):
                        previous = [previous]
                    if TRACE:
                        logger_debug('component_new_key:', ccolumn, 'existing:', type(previous), repr(previous))
                        logger_debug('component_key:', ckey, 'component_val:', type(cval), repr(cval))
                    row[ccolumn] = ' \n'.join(previous + [cval])
            continue

        # everything else: strings as-is, other truthy values via repr,
        # falsy values as an empty cell
        if isinstance(value, str):
            row[column] = value
        elif value:
            row[column] = repr(value)
        else:
            row[column] = ''

    return row