Skip to content

Commit

Permalink
Add SPDX generation using spdx-tools
Browse files Browse the repository at this point in the history
This is set up to produce the same output as the current spdx generation module while utilising the spdx-tools library. The goal is to replace the current module with this new one, which will allow easy migration to more SPDX formats as well as SPDXv3.

Signed-off-by: Armin Tänzer <[email protected]>
  • Loading branch information
armintaenzertng committed Jun 23, 2023
1 parent 62507ed commit ce04e78
Show file tree
Hide file tree
Showing 14 changed files with 913 additions and 0 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ GitPython~=3.1
prettytable~=3.6
packageurl-python>=0.10.4
license-expression>=30.1
spdx-tools>=0.8.0a3

2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ tern.formats =
yaml = tern.formats.yaml.generator:YAML
html = tern.formats.html.generator:HTML
cyclonedxjson = tern.formats.cyclonedx.cyclonedxjson.generator:CycloneDXJSON
spdxjson_new = tern.formats.spdx_new.spdxjson.generator:SpdxJSON
spdxtagvalue_new = tern.formats.spdx_new.spdxtagvalue.generator:SpdxTagValue
tern.extensions =
cve_bin_tool = tern.extensions.cve_bin_tool.executor:CveBinTool
scancode = tern.extensions.scancode.executor:Scancode
Expand Down
Empty file.
15 changes: 15 additions & 0 deletions tern/formats/spdx_new/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from spdx_tools.spdx.model import Version

# Fixed identity and licensing values for generated SPDX documents.
# Values containing {placeholders} are templates meant to be filled in
# with str.format-style substitution.
DOCUMENT_ID = 'SPDXRef-DOCUMENT'
DOCUMENT_NAME = 'Tern report for {image_name}'
SPDX_VERSION = 'SPDX-2.2'
DATA_LICENSE = 'CC0-1.0'
DOCUMENT_COMMENT = (
    'This document was generated by '
    'the Tern Project: https://github.com/tern-tools/tern'
)
DOCUMENT_NAMESPACE = (
    'https://spdx.org/spdxdocs/tern-'
    'report-{version}-{image}-{uuid}'
)
LICENSE_LIST_VERSION = Version(3, 20)
CREATOR_NAME = 'tern-{version}'

# The *_SNAPSHOT variants identify a document by {timestamp} rather than
# by image name and tool version.
# TODO: different name here that is not specific to JSON
DOCUMENT_NAME_SNAPSHOT = 'Tern SPDX JSON SBoM'
DOCUMENT_NAMESPACE_SNAPSHOT = (
    'https://spdx.org/spdxdocs/tern-report-'
    '{timestamp}-{uuid}'
)
125 changes: 125 additions & 0 deletions tern/formats/spdx_new/file_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
File level helpers for SPDX document generator
"""
from datetime import datetime
from typing import List

from spdx_tools.spdx.model import File as SpdxFile, SpdxNone, SpdxNoAssertion, Checksum, ChecksumAlgorithm

from tern.classes.file_data import FileData
from tern.classes.image import Image
from tern.classes.image_layer import ImageLayer
from tern.classes.template import Template
from tern.formats.spdx_new.layer_helpers import get_layer_checksum
from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_file_spdxref


def get_layer_files_list(layer_obj: ImageLayer, template: Template, timestamp: datetime) -> List[SpdxFile]:
    """Build the list of SPDX Files for the files of a single layer.

    The layer's id is not known here, so the string form of ``timestamp``
    stands in for it when deriving each file's SPDX reference. A file whose
    reference was already produced is skipped, so every reference appears
    at most once in the result.
    """
    # the stand-in layer id is the same for every file in this layer
    layer_id = str(timestamp)
    seen_refs = set()
    result: List[SpdxFile] = []
    for entry in layer_obj.files:
        ref = get_file_spdxref(entry, layer_id)
        if ref in seen_refs:
            continue
        result.append(get_file_dict(entry, template, layer_id))
        seen_refs.add(ref)
    return result


def get_files_list(image_obj: Image, template: Template) -> List[SpdxFile]:
    """Build the list of SPDX Files for every analyzed layer of an image.

    Layers whose ``files_analyzed`` flag is falsy are skipped. Within an
    analyzed layer, the layer checksum value serves as the layer id when
    deriving a file's SPDX reference. References are tracked across all
    layers so that a duplicate reference is only emitted once.
    """
    seen_refs = set()
    collected: List[SpdxFile] = []
    for layer in image_obj.layers:
        if not layer.files_analyzed:
            continue
        # the layer checksum doubles as the layer id
        layer_id = get_layer_checksum(layer).value
        for entry in layer.files:
            ref = get_file_spdxref(entry, layer_id)
            if ref in seen_refs:
                continue
            collected.append(get_file_dict(entry, template, layer_id))
            seen_refs.add(ref)
    return collected


def get_file_dict(filedata: FileData, template: Template, layer_id: str) -> SpdxFile:
    """Convert a FileData object into its SPDX File representation.

    ``template`` supplies the mapping from which the file name and file
    type are read. ``layer_id`` is mixed into the SPDX reference so that
    copies of the same file in different places of the image get distinct
    ids. The concluded license and the copyright text are always set to
    NOASSERTION; empty notice, comment and contributor values are passed
    on as ``None``.
    """
    mapping = filedata.to_dict(template)

    if not filedata.licenses:
        license_info_in_file = [SpdxNone()]
    else:
        # each distinct license becomes either a parsed SPDX expression or
        # a LicenseRef, as decided by get_package_license_declared
        license_info_in_file = [
            get_package_license_declared(lic) for lic in set(filedata.licenses)
        ]

    notice_text = get_file_notice(filedata)
    comment_text = get_file_comment(filedata)
    contributor_names = get_file_contributors(filedata)

    return SpdxFile(
        spdx_id=get_file_spdxref(filedata, layer_id),
        name=mapping['FileName'],
        checksums=[get_file_checksum(filedata)],
        license_concluded=SpdxNoAssertion(),  # we don't provide this
        copyright_text=SpdxNoAssertion(),  # we don't know this
        file_types=[mapping['FileType']] if mapping['FileType'] else None,
        license_info_in_file=license_info_in_file,
        notice=notice_text or None,
        comment=comment_text or None,
        contributors=contributor_names or None,
    )


def get_file_checksum(filedata: FileData) -> Checksum:
    """Return the SHA1 checksum of the file as an SPDX Checksum.

    SHA1 is used because that is the checksum the SPDX spec currently
    requires for files.
    """
    sha1_value = filedata.get_checksum('sha1')
    return Checksum(ChecksumAlgorithm.SHA1, sha1_value)


def get_file_notice(filedata: FileData) -> str:
    """Return all copyrights found in the file, one per line.

    Every copyright is terminated by a newline. The result is an empty
    string when the file has no copyrights.
    """
    return ''.join(cp + '\n' for cp in filedata.copyrights)


def get_file_comment(filedata: FileData) -> str:
    """Return all file level notices formatted as a comment string.

    Each origin contributes a ``<origin_str>:`` line followed by one
    ``<level>: <message>`` line per notice. Every line ends with a newline.
    The result is an empty string when there are no origins.
    """
    lines = []
    for origin in filedata.origins.origins:
        lines.append('{}:'.format(origin.origin_str))
        lines.extend(
            '{}: {}'.format(notice.level, notice.message)
            for notice in origin.notices
        )
    return ''.join(line + '\n' for line in lines)


def get_file_contributors(filedata: FileData) -> List[str]:
    """Return the authors found in the file as a new list.

    The SPDX spec allows an optional list of file contributors; the file's
    authors are used for it. The list is empty when there are no authors.
    """
    return list(filedata.authors)
128 changes: 128 additions & 0 deletions tern/formats/spdx_new/general_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
General helpers for SPDX document generator
"""
import datetime
import hashlib
import io
import re
import uuid
from datetime import datetime
from typing import Union, Callable, IO, Tuple

from license_expression import get_spdx_licensing, LicenseExpression, Licensing
from spdx_tools.spdx.model import SpdxNone, Document

from tern.classes.file_data import FileData
from tern.classes.image import Image
from tern.classes.image_layer import ImageLayer
from tern.classes.package import Package


def get_uuid() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    generated = uuid.uuid4()
    return str(generated)


def get_current_timestamp() -> datetime:
    """Return the current UTC time as a naive datetime, truncated to seconds.

    The value carries no tzinfo, exactly as before. It is obtained through
    the timezone-aware ``datetime.now`` API because ``datetime.utcnow`` is
    deprecated since Python 3.12.
    """
    # local import: only the datetime class is bound at module level
    from datetime import timezone

    aware_now = datetime.now(timezone.utc)
    # strip tzinfo so callers keep receiving a naive UTC value
    return aware_now.replace(tzinfo=None, microsecond=0)


def get_string_id(string: str) -> str:
    """Return a short identifier derived from the given string.

    The identifier is the last seven hex characters of the SHA-256 digest
    of the UTF-8 encoded string. It is deterministic, but because it is so
    short, distinct strings are not guaranteed to get distinct identifiers.
    """
    hasher = hashlib.sha256()
    hasher.update(string.encode('utf-8'))
    hex_digest = hasher.hexdigest()
    return hex_digest[-7:]


def get_license_ref(license_string: str) -> str:
    """Return the SPDX LicenseRef identifier for a license string.

    The identifier is ``LicenseRef-`` followed by the short string id of
    the license text, which is converted with ``str()`` first.
    """
    license_text = str(license_string)
    return 'LicenseRef-' + get_string_id(license_text)


def replace_invalid_chars_in_license_expression(license_string: str) -> str:
    """Replace common invalid SPDX license characters in a license string.

    ``,`` becomes `` and``, ``/`` becomes ``-``, ``;`` becomes ``.`` and
    ``&`` becomes ``and``. A string without any of these characters is
    returned unchanged.
    """
    # None of the replacement texts contains a character that is itself
    # replaced, so a single translation pass is enough.
    substitutions = str.maketrans({
        ',': ' and',
        '/': '-',
        ';': '.',
        '&': 'and',
    })
    return license_string.translate(substitutions)


def is_valid_license_expression(license_string: str) -> bool:
    """Return True if the string validates as an SPDX license expression.

    The string is valid when validation against the SPDX licensing reports
    no errors. An AttributeError raised during validation also counts as
    invalid.
    """
    spdx_licensing = get_spdx_licensing()
    try:
        found_errors = spdx_licensing.validate(license_string).errors
        is_valid = found_errors == []
    except AttributeError:
        # this is how invalid license characters surface here
        is_valid = False
    return is_valid


def get_package_license_declared(package_license_declared: str) -> Union[LicenseExpression, SpdxNone]:
    """Return the SPDX license value for a declared package or file license.

    Common invalid SPDX license characters are first substituted using
    replace_invalid_chars_in_license_expression(). If the substituted
    string is a valid SPDX license expression, the parsed expression of
    the substituted string is returned. If not, the parsed LicenseRef of
    the original declared license string passed in to the function is
    returned. If a blank string is passed in, return `NONE`.
    """
    if not package_license_declared:
        return SpdxNone()

    cleaned_license = replace_invalid_chars_in_license_expression(package_license_declared)
    if is_valid_license_expression(cleaned_license):
        return Licensing().parse(cleaned_license)

    # The LicenseRef must be derived from the ORIGINAL text, not the
    # substituted one: the extracted licensing info of the image derives
    # its LicenseRef ids from the original license strings, and the two
    # ids have to match for the reference to resolve.
    return Licensing().parse(get_license_ref(package_license_declared))


def get_serialized_document_string(spdx_document: Document, writer_function: Callable[[Document, IO[str]], str]) -> str:
    """Serialize an SPDX document to a string using the given writer.

    ``writer_function`` is called with the document, an in-memory text
    stream and ``validate=False``; whatever it wrote to the stream is
    returned. The writer's own return value is ignored.
    """
    buffer = io.StringIO()
    try:
        writer_function(spdx_document, buffer, validate=False)
        return buffer.getvalue()
    finally:
        buffer.close()


###########################################################################################
# central place for SPDXRef-generators to avoid circular imports as these are widely used #
###########################################################################################

def get_image_spdxref(image_obj: Image) -> str:
    """Return the SPDX reference ID for the given image object.

    The ID is ``SPDXRef-`` followed by the image's human readable id.
    """
    # per the original note, this id covers the image name, tag and id
    readable_id = image_obj.get_human_readable_id()
    return f'SPDXRef-{readable_id}'


def get_package_spdxref(package_obj: Package) -> Tuple[str, str]:
    """Return the SPDX reference IDs for a package and its source package.

    Returns a ``(binary_ref, source_ref)`` tuple. ``binary_ref`` is built
    from the package name and version. ``source_ref`` is built from the
    source name and the source version with a ``-src`` suffix, which
    differentiates it from the binary ref; it is an empty string when the
    package has no source name.

    Both refs go through the same sanitizing step: every character that is
    not a letter, a number, "." or "-" is replaced by "-", as those are
    the only characters SPDX allows in an identifier.
    """
    def _sanitize(raw_ref: str) -> str:
        # one rule for both refs, so binary and source ids cannot diverge
        return re.sub(r'[^A-Za-z0-9.-]', '-', raw_ref)

    pkg_ref = _sanitize(f"{package_obj.name}-{package_obj.version}")
    if not package_obj.src_name:
        return f'SPDXRef-{pkg_ref}', ''

    # differentiate between binary and source package refs
    src_ver = package_obj.src_version + "-src"
    src_ref = _sanitize(f"{package_obj.src_name}-{src_ver}")
    return f'SPDXRef-{pkg_ref}', f'SPDXRef-{src_ref}'


def get_layer_spdxref(layer_obj: ImageLayer) -> str:
    """Return the SPDX reference ID for the given layer object.

    The ID is ``SPDXRef-`` followed by the first ten characters of the
    layer's diff_id.
    """
    short_diff_id = layer_obj.diff_id[:10]
    return f'SPDXRef-{short_diff_id}'


def get_file_spdxref(filedata: FileData, layer_id: str) -> str:
    """Return the SPDX reference ID for a file within a layer.

    According to the spec the ID has the form ``SPDXRef-<id>``. Here
    ``<id>`` is the short string id of the file path, the first seven
    characters of the file checksum and the layer_id, concatenated in
    that order.
    """
    identity_seed = filedata.path + filedata.checksum[:7] + layer_id
    short_id = get_string_id(identity_seed)
    return f'SPDXRef-{short_id}'
68 changes: 68 additions & 0 deletions tern/formats/spdx_new/image_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
Image level helpers for SPDX document generator
Images for SPDX act like a Package
"""
from typing import List

from spdx_tools.spdx.model import ExtractedLicensingInfo, Package as SpdxPackage, \
SpdxNoAssertion

from tern.classes.image import Image
from tern.classes.template import Template
from tern.formats.spdx_new.layer_helpers import get_layer_licenses
from tern.formats.spdx_new.general_helpers import get_license_ref, get_uuid, is_valid_license_expression, \
get_image_spdxref
from tern.utils.general import get_git_rev_or_version


def get_image_extracted_licenses(image_obj: Image) -> List[ExtractedLicensingInfo]:
    """Return the extracted licensing info for all non-SPDX licenses of an image.

    The distinct license texts are gathered from every layer: the layer's
    file licenses, each package's ``pkg_license``, and each package's
    ``pkg_licenses`` joined into one comma separated text. Every text that
    is not a valid SPDX license expression yields one entry that pairs its
    LicenseRef with the plain text.
    """
    license_texts = set()
    for layer in image_obj.layers:
        # file level licenses of the layer, if there are any
        license_texts.update(get_layer_licenses(layer))
        # package level licenses that are not accounted for yet
        for pkg in layer.packages:
            if pkg.pkg_license:
                license_texts.add(pkg.pkg_license)
            if pkg.pkg_licenses:
                # debian licenses from the copyright text count as one license
                license_texts.add(", ".join(pkg.pkg_licenses))

    return [
        ExtractedLicensingInfo(license_id=get_license_ref(text), extracted_text=text)
        for text in license_texts
        if not is_valid_license_expression(text)
    ]


def get_image_dict(image_obj: Image, template: Template) -> SpdxPackage:  # TODO: these kind of functions don't produce dicts anymore, rename them
    """Return the SPDX Package that represents the given image.

    The package name and version are read from the image's template
    mapping. Download location, supplier, both license fields and the
    copyright text are NOASSERTION, and the package is marked as not
    having its files analyzed.
    """
    mapping = image_obj.to_dict(template)
    package_fields = {
        'spdx_id': get_image_spdxref(image_obj),
        'name': mapping["PackageName"],
        'download_location': SpdxNoAssertion(),
        'version': mapping["PackageVersion"],
        'supplier': SpdxNoAssertion(),
        'files_analyzed': False,
        'license_concluded': SpdxNoAssertion(),
        'license_declared': SpdxNoAssertion(),
        'copyright_text': SpdxNoAssertion(),
    }
    return SpdxPackage(**package_fields)


def get_document_namespace(image_obj: Image) -> str:
    """Return a unique SPDX document URI for the given image.

    The URI combines the tool name, the second element of what
    get_git_rev_or_version() returns (used as the tool version), the
    image name and a freshly generated uuid.
    """
    tool_version = get_git_rev_or_version()[1]
    image_name = image_obj.name
    document_uuid = get_uuid()
    return f'https://spdx.org/spdxdocs/tern-report-{tool_version}-{image_name}-{document_uuid}'
Loading

0 comments on commit ce04e78

Please sign in to comment.