Add SPDX generation using spdx-tools

This is set up to produce the same output as the current SPDX generation module while utilising the spdx-tools library. The goal is to replace the current module with this new one, which will allow easy migration to more SPDX formats as well as SPDXv3.

Signed-off-by: Armin Tänzer <[email protected]>
tern-tools · Jun 23, 2023 · ce04e78 · ce04e78
1 parent 62507ed
commit ce04e78
Show file tree

Hide file tree

Showing 14 changed files with 913 additions and 0 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -18,4 +18,5 @@ GitPython~=3.1
 prettytable~=3.6
 packageurl-python>=0.10.4
 license-expression>=30.1
+spdx-tools>=0.8.0a3
 
diff --git a/setup.cfg b/setup.cfg
@@ -52,6 +52,8 @@ tern.formats =
     yaml = tern.formats.yaml.generator:YAML
     html = tern.formats.html.generator:HTML
     cyclonedxjson = tern.formats.cyclonedx.cyclonedxjson.generator:CycloneDXJSON
+    spdxjson_new = tern.formats.spdx_new.spdxjson.generator:SpdxJSON
+    spdxtagvalue_new = tern.formats.spdx_new.spdxtagvalue.generator:SpdxTagValue
 tern.extensions =
     cve_bin_tool = tern.extensions.cve_bin_tool.executor:CveBinTool
     scancode = tern.extensions.scancode.executor:Scancode

diff --git a/tern/formats/spdx_new/__init__.py b/tern/formats/spdx_new/__init__.py
diff --git a/tern/formats/spdx_new/constants.py b/tern/formats/spdx_new/constants.py
@@ -0,0 +1,15 @@
+from spdx_tools.spdx.model import Version
+
+# Values and string templates used when assembling the SPDX document.
+# Names in curly braces are placeholders to be filled in by the caller.
+DOCUMENT_ID = 'SPDXRef-DOCUMENT'
+DOCUMENT_NAME = 'Tern report for {image_name}'
+SPDX_VERSION = 'SPDX-2.2'
+DATA_LICENSE = 'CC0-1.0'
+DOCUMENT_COMMENT = (
+    'This document was generated by '
+    'the Tern Project: https://github.com/tern-tools/tern'
+)
+DOCUMENT_NAMESPACE = (
+    'https://spdx.org/spdxdocs/tern-'
+    'report-{version}-{image}-{uuid}'
+)
+LICENSE_LIST_VERSION = Version(3, 20)
+CREATOR_NAME = 'tern-{version}'
+# TODO: different name here that is not specific to JSON
+DOCUMENT_NAME_SNAPSHOT = 'Tern SPDX JSON SBoM'
+DOCUMENT_NAMESPACE_SNAPSHOT = (
+    'https://spdx.org/spdxdocs/tern-report-'
+    '{timestamp}-{uuid}'
+)
diff --git a/tern/formats/spdx_new/file_helpers.py b/tern/formats/spdx_new/file_helpers.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+File level helpers for SPDX document generator
+"""
+from datetime import datetime
+from typing import List
+
+from spdx_tools.spdx.model import File as SpdxFile, SpdxNone, SpdxNoAssertion, Checksum, ChecksumAlgorithm
+
+from tern.classes.file_data import FileData
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.template import Template
+from tern.formats.spdx_new.layer_helpers import get_layer_checksum
+from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_file_spdxref
+
+
+def get_layer_files_list(layer_obj: ImageLayer, template: Template, timestamp: datetime) -> List[SpdxFile]:
+    """Given a layer object, the SPDX template mapping and a timestamp,
+    return one SPDX File per file in the layer. Files whose SPDXRef has
+    already been produced for this layer are skipped, so each reference
+    appears only once in the returned list"""
+    # we do not know the layer's id, so the timestamp string stands in for it
+    stand_in_id = str(timestamp)
+    seen_refs = set()
+    result: List[SpdxFile] = []
+    for entry in layer_obj.files:
+        ref = get_file_spdxref(entry, stand_in_id)
+        if ref in seen_refs:
+            continue
+        result.append(get_file_dict(entry, template, stand_in_id))
+        seen_refs.add(ref)
+    return result
+
+
+def get_files_list(image_obj: Image, template: Template) -> List[SpdxFile]:
+    '''Given an image object and the SPDX template mapping, return a list
+    with one SPDX File for each file found in the analyzed layers of the
+    image. Layers whose files were not analyzed contribute nothing.'''
+    collected: List[SpdxFile] = []
+
+    # file refs already emitted; used to drop duplicate files that may be
+    # located in different places in the filesystem
+    emitted_refs = set()
+    for layer in image_obj.layers:
+        if not layer.files_analyzed:
+            continue
+        # the layer checksum value serves as the layer id
+        layer_id = get_layer_checksum(layer).value
+        for entry in layer.files:
+            ref = get_file_spdxref(entry, layer_id)
+            if ref in emitted_refs:
+                continue
+            collected.append(get_file_dict(entry, template, layer_id))
+            emitted_refs.add(ref)
+    return collected
+
+
+def get_file_dict(filedata: FileData, template: Template, layer_id: str) -> SpdxFile:
+    """Given a FileData object and its SPDX template mapping, return the
+    SPDX File describing it. The layer_id goes into the file's SPDXRef so
+    that copies of the same file occurring in different places in the image
+    can be told apart. Notice, comment and contributors are passed as None
+    when they are empty"""
+    mapping = filedata.to_dict(template)
+
+    if filedata.licenses:
+        # each distinct license becomes a license expression if it is a
+        # valid SPDX identifier; otherwise it becomes a LicenseRef
+        license_info_in_file = [
+            get_package_license_declared(lic)
+            for lic in set(filedata.licenses)
+        ]
+    else:
+        license_info_in_file = [SpdxNone()]
+
+    # empty results are turned into None for the optional fields
+    notice_or_none = get_file_notice(filedata) or None
+    comment_or_none = get_file_comment(filedata) or None
+    contributors_or_none = get_file_contributors(filedata) or None
+
+    return SpdxFile(
+        spdx_id=get_file_spdxref(filedata, layer_id),
+        name=mapping['FileName'],
+        checksums=[get_file_checksum(filedata)],
+        license_concluded=SpdxNoAssertion(),  # we don't provide this
+        copyright_text=SpdxNoAssertion(),     # we don't know this
+        file_types=[mapping['FileType']] if mapping['FileType'] else None,
+        license_info_in_file=license_info_in_file,
+        notice=notice_or_none,
+        comment=comment_or_none,
+        contributors=contributors_or_none,
+    )
+
+
+def get_file_checksum(filedata: FileData) -> Checksum:
+    """Given a FileData object, return its SHA1 checksum wrapped in an SPDX
+    Checksum. SHA1 is the checksum the spec currently requires"""
+    sha1_checksum = Checksum(ChecksumAlgorithm.SHA1, filedata.get_checksum('sha1'))
+    return sha1_checksum
+
+
+def get_file_notice(filedata: FileData) -> str:
+    """Return all copyrights found in a file as one string, each copyright
+    terminated by a newline. Return an empty string if there are none"""
+    return ''.join(cp + '\n' for cp in filedata.copyrights)
+
+
+def get_file_comment(filedata: FileData) -> str:
+    """Return a comment string listing all file level notices. Every origin
+    contributes a '<origin_str>:' line followed by one '<level>: <message>'
+    line per notice. Return an empty string if there are no origins"""
+    lines = []
+    for origin in filedata.origins.origins:
+        lines.append('{}:'.format(origin.origin_str))
+        lines.extend(
+            '{}: {}'.format(notice.level, notice.message)
+            for notice in origin.notices
+        )
+    # every line, including the last one, ends with a newline
+    return ''.join(line + '\n' for line in lines)
+
+
+def get_file_contributors(filedata: FileData) -> List[str]:
+    """The SPDX spec allows for an optional list of file contributors.
+    Return a new list holding the authors found in the file, which is
+    empty when there are none"""
+    return list(filedata.authors)
diff --git a/tern/formats/spdx_new/general_helpers.py b/tern/formats/spdx_new/general_helpers.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+General helpers for SPDX document generator
+"""
+import datetime
+import hashlib
+import io
+import re
+import uuid
+from datetime import datetime
+from typing import Union, Callable, IO, Tuple
+
+from license_expression import get_spdx_licensing, LicenseExpression, Licensing
+from spdx_tools.spdx.model import SpdxNone, Document
+
+from tern.classes.file_data import FileData
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.package import Package
+
+
+def get_uuid() -> str:
+    """Return a freshly generated random (version 4) UUID as a string"""
+    random_uuid = uuid.uuid4()
+    return str(random_uuid)
+
+
+def get_current_timestamp() -> datetime:
+    """Return the current UTC time as a naive datetime truncated to whole
+    seconds.
+
+    datetime.utcnow() is deprecated since Python 3.12, so the time is taken
+    as a timezone-aware UTC value and the tzinfo is removed again; callers
+    keep receiving the same naive UTC datetime as before"""
+    # local import: this module only imports the datetime class at file level
+    from datetime import timezone
+    return datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None)
+
+
+def get_string_id(string: str) -> str:
+    """Return a short identifier for the given string: the last seven hex
+    characters of the SHA-256 digest of its UTF-8 encoding"""
+    hex_digest = hashlib.sha256(string.encode('utf-8')).hexdigest()
+    return hex_digest[-7:]
+
+
+def get_license_ref(license_string: str) -> str:
+    """For SPDX format, return the LicenseRef string for the given license:
+    'LicenseRef-' followed by the string id of the license text"""
+    license_id = get_string_id(str(license_string))
+    return 'LicenseRef-' + license_id
+
+
+def replace_invalid_chars_in_license_expression(license_string: str) -> str:
+    """Given a license string, replace common invalid SPDX license
+    characters: ',' becomes ' and', '/' becomes '-', ';' becomes '.' and
+    '&' becomes 'and'. A string without these characters is returned
+    unchanged."""
+    # none of the replacement texts contains a character that is itself
+    # replaced, so a single translation pass gives the same result as
+    # replacing the characters one after the other
+    substitutions = str.maketrans({',': ' and', '/': '-', ';': '.', '&': 'and'})
+    return license_string.translate(substitutions)
+
+
+def is_valid_license_expression(license_string: str) -> bool:
+    """Return True if validating the string against the licensing object
+    from get_spdx_licensing() reports no errors, False otherwise"""
+    licensing = get_spdx_licensing()
+    try:
+        found_errors = licensing.validate(license_string).errors
+    # Catch any invalid license chars here
+    except AttributeError:
+        return False
+    else:
+        return found_errors == []
+
+
+def get_package_license_declared(package_license_declared: str) -> Union[LicenseExpression, SpdxNone]:
+    """Determine the SPDX representation of the declared license string of
+    a package or file.
+    Common invalid SPDX license characters are first substituted using
+    replace_invalid_chars_in_license_expression(). If the result is a valid
+    SPDX license expression, return it parsed as a license expression.
+    If not, return the parsed LicenseRef of the original declared license
+    string passed in to the function. If a blank string is passed in,
+    return `NONE`."""
+    if not package_license_declared:
+        return SpdxNone()
+
+    cleaned_license = replace_invalid_chars_in_license_expression(package_license_declared)
+    if is_valid_license_expression(cleaned_license):
+        return Licensing().parse(cleaned_license)
+
+    # The LicenseRef must be derived from the ORIGINAL text, not the cleaned
+    # one: the extracted licensing info for the image is keyed by
+    # get_license_ref() of the unmodified license string, so hashing the
+    # cleaned string would produce a reference that matches nothing
+    return Licensing().parse(get_license_ref(package_license_declared))
+
+
+def get_serialized_document_string(spdx_document: Document, writer_function: Callable[[Document, IO[str]], str]) -> str:
+    """Let writer_function write the document, with validate=False, into an
+    in-memory text stream and return everything that was written"""
+    buffer = io.StringIO()
+    try:
+        writer_function(spdx_document, buffer, validate=False)
+        return buffer.getvalue()
+    finally:
+        # the stream is closed whether or not the writer succeeded
+        buffer.close()
+
+
+###########################################################################################
+# central place for SPDXRef-generators to avoid circular imports as these are widely used #
+###########################################################################################
+
+def get_image_spdxref(image_obj: Image) -> str:
+    """Given the image object, return its SPDX reference ID, built from the
+    image's human readable id"""
+    # the human readable id carries the image name, tag and id
+    readable_id = image_obj.get_human_readable_id()
+    return f'SPDXRef-{readable_id}'
+
+
+def get_package_spdxref(package_obj: Package) -> Tuple[str, str]:
+    """Given the package obj, return a tuple with the SPDX reference ID for
+    the binary package and the one for the source package. The second
+    element is an empty string when the package has no source name.
+    The characters ':', '+', '~', '_' and '/' are replaced by '-' in both
+    references"""
+    # replace all the characters that SPDX doesn't like
+    # allowed characters are: letters, numbers, "." and "-"
+    # the same pattern is applied to both refs; previously '_' was only
+    # replaced in the binary package ref and survived in the source ref
+    invalid_chars = r'[:+~_/]'
+
+    pkg_ref = f"{package_obj.name}-{package_obj.version}"
+    clean_pkg_ref = re.sub(invalid_chars, r'-', pkg_ref)
+    if not package_obj.src_name:
+        return f'SPDXRef-{clean_pkg_ref}', ''
+
+    # the "-src" suffix differentiates source from binary package refs
+    src_ver = package_obj.src_version + "-src"
+    src_ref = f"{package_obj.src_name}-{src_ver}"
+    clean_src_ref = re.sub(invalid_chars, r'-', src_ref)
+    return f'SPDXRef-{clean_pkg_ref}', f'SPDXRef-{clean_src_ref}'
+
+
+def get_layer_spdxref(layer_obj: ImageLayer) -> str:
+    """Given the layer object, return its SPDX reference ID, built from the
+    first ten characters of the layer's diff_id"""
+    short_diff_id = layer_obj.diff_id[:10]
+    return f'SPDXRef-{short_diff_id}'
+
+
+def get_file_spdxref(filedata: FileData, layer_id: str) -> str:
+    """Given a FileData object, return an identifier for it in the SPDX
+    document. According to the spec, this should be of the form:
+    SPDXRef-<id>. The id is the string id calculated over the file path,
+    the first seven characters of the file checksum and the layer_id"""
+    fileid = get_string_id(filedata.path + filedata.checksum[:7] + layer_id)
+    return 'SPDXRef-' + fileid
diff --git a/tern/formats/spdx_new/image_helpers.py b/tern/formats/spdx_new/image_helpers.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Image level helpers for SPDX document generator
+Images for SPDX act like a Package
+"""
+from typing import List
+
+from spdx_tools.spdx.model import ExtractedLicensingInfo, Package as SpdxPackage, \
+    SpdxNoAssertion
+
+from tern.classes.image import Image
+from tern.classes.template import Template
+from tern.formats.spdx_new.layer_helpers import get_layer_licenses
+from tern.formats.spdx_new.general_helpers import get_license_ref, get_uuid, is_valid_license_expression, \
+    get_image_spdxref
+from tern.utils.general import get_git_rev_or_version
+
+
+def get_image_extracted_licenses(image_obj: Image) -> List[ExtractedLicensingInfo]:
+    """Given an image_obj, return a list of ExtractedLicensingInfo with one
+    entry per distinct file or package license text that is not a valid
+    SPDX license expression. Each entry pairs the LicenseRef of the text
+    with the plain text itself."""
+
+    license_texts = set()
+    for layer in image_obj.layers:
+        # all of the file licenses of the layer, if they exist
+        license_texts.update(get_layer_licenses(layer))
+        # then any package licenses not already accounted for
+        for package in layer.packages:
+            if package.pkg_license:
+                license_texts.add(package.pkg_license)
+            # debian licenses from copyright text are added as one license
+            if package.pkg_licenses:
+                license_texts.add(", ".join(package.pkg_licenses))
+
+    # only texts that fail SPDX validation need extracted licensing info
+    return [
+        ExtractedLicensingInfo(license_id=get_license_ref(text), extracted_text=text)
+        for text in license_texts
+        if not is_valid_license_expression(text)
+    ]
+
+
+def get_image_dict(image_obj: Image, template: Template) -> SpdxPackage:  # TODO: these kind of functions don't produce dicts anymore, rename them
+    """Given an image object and the template object for SPDX, return the
+    SPDX Package representing the image. Name and version come from the
+    template mapping of the image; download location, supplier, licenses
+    and copyright text are all set to NOASSERTION."""
+    image_fields = image_obj.to_dict(template)
+    package_fields = {
+        'spdx_id': get_image_spdxref(image_obj),
+        'name': image_fields["PackageName"],
+        'download_location': SpdxNoAssertion(),
+        'version': image_fields["PackageVersion"],
+        'supplier': SpdxNoAssertion(),
+        'files_analyzed': False,
+        'license_concluded': SpdxNoAssertion(),
+        'license_declared': SpdxNoAssertion(),
+        'copyright_text': SpdxNoAssertion(),
+    }
+    return SpdxPackage(**package_fields)
+
+
+def get_document_namespace(image_obj: Image) -> str:
+    """Given the image object, return an SPDX document uri. It combines the
+    second element returned by get_git_rev_or_version(), the image name
+    and a newly generated uuid"""
+    tool_version = get_git_rev_or_version()[1]
+    image_name = image_obj.name
+    unique_part = get_uuid()
+    return f'https://spdx.org/spdxdocs/tern-report-{tool_version}-{image_name}-{unique_part}'