opossum-tool · abraemer · Jan 16, 2025 · Jan 10, 2025 · Jan 10, 2025 · Jan 13, 2025
diff --git a/src/opossum_lib/cli.py b/src/opossum_lib/cli.py
@@ -14,6 +14,7 @@
 from opossum_lib.opossum.file_generation import write_opossum_information_to_file
 from opossum_lib.opossum.opossum_file import OpossumInformation
 from opossum_lib.opossum.read_opossum_file import read_opossum_file
+from opossum_lib.scancode.convert_scancode_to_opossum import convert_scancode_to_opossum
 from opossum_lib.spdx.convert_to_opossum import convert_spdx_to_opossum_information
 
 
@@ -35,6 +36,12 @@ def opossum_file() -> None:
     multiple=True,
     type=click.Path(exists=True),
 )
+@click.option(
+    "--scan-code-json",
+    help="ScanCode json files used as input.",
+    multiple=True,
+    type=click.Path(exists=True),
+)
 @click.option(
     "--outfile",
     "-o",
@@ -43,16 +50,19 @@ def opossum_file() -> None:
     help="The file path to write the generated opossum document to. "
     'If appropriate, the extension ".opossum" will be appended.',
 )
-def generate(spdx: list[str], opossum: list[str], outfile: str) -> None:
+def generate(
+    spdx: list[str], scan_code_json: list[str], opossum: list[str], outfile: str
+) -> None:
     """
     Generate an Opossum file from various other file formats.
 
     \b
     Currently supported input formats:
       - SPDX
+      - ScanCode
     """
-    validate_input_exit_on_error(spdx, opossum)
-    opossum_information = convert_after_valid_input(spdx, opossum)
+    validate_input_exit_on_error(spdx, scan_code_json, opossum)
+    opossum_information = convert_after_valid_input(spdx, scan_code_json, opossum)
 
     if not outfile.endswith(".opossum"):
         outfile += ".opossum"
@@ -63,8 +72,10 @@ def generate(spdx: list[str], opossum: list[str], outfile: str) -> None:
     write_opossum_information_to_file(opossum_information, Path(outfile))
 
 
-def validate_input_exit_on_error(spdx: list[str], opossum: list[str]) -> None:
-    total_number_of_files = len(spdx) + len(opossum)
+def validate_input_exit_on_error(
+    spdx: list[str], scan_code_json: list[str], opossum: list[str]
+) -> None:
+    total_number_of_files = len(spdx) + len(scan_code_json) + len(opossum)
     if total_number_of_files == 0:
         logging.warning("No input provided. Exiting.")
         sys.exit(1)
@@ -74,11 +85,23 @@ def validate_input_exit_on_error(spdx: list[str], opossum: list[str]) -> None:
 
 
 def convert_after_valid_input(
-    spdx: list[str], opossum_files: list[str]
+    spdx: list[str], scan_code_json: list[str], opossum_files: list[str]
 ) -> OpossumInformation:
+    """Convert an input file into opossum information.
+
+    Dispatches on the input lists in order: a single SPDX file, then a single
+    ScanCode json file, otherwise the first opossum file is read.
+
+    NOTE(review): the last branch indexes ``opossum_files[0]`` without a
+    length check; presumably ``validate_input_exit_on_error`` has already
+    ensured exactly one input file overall — confirm.
+    """
     if len(spdx) == 1:
         the_spdx_file = spdx[0]
         return convert_spdx_to_opossum_information(the_spdx_file)
+    elif len(scan_code_json) == 1:
+        the_scan_code_json = scan_code_json[0]
+        return convert_scancode_to_opossum(the_scan_code_json)
     else:
         opossum_input_file = opossum_files[0]
         return read_opossum_file(opossum_input_file)

diff --git a/src/opossum_lib/scancode/__init__.py b/src/opossum_lib/scancode/__init__.py
diff --git a/src/opossum_lib/scancode/convert_scancode_to_opossum.py b/src/opossum_lib/scancode/convert_scancode_to_opossum.py
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+import json
+import logging
+import sys
+import uuid
+
+from opossum_lib.opossum.opossum_file import (
+    Metadata,
+    OpossumInformation,
+)
+from opossum_lib.scancode.model import ScanCodeData
+from opossum_lib.scancode.resource_tree import (
+    convert_to_opossum_resources,
+    create_attribution_mapping,
+    scancode_to_file_tree,
+)
+
+
+def convert_scancode_to_opossum(filename: str) -> OpossumInformation:
+    """Read a ScanCode json file and convert it to opossum information.
+
+    Args:
+        filename: Path of the ScanCode json file to read.
+
+    Returns:
+        Opossum information whose resources and attribution mappings are
+        derived from the file tree built out of the ScanCode data.
+
+    Logs an error and exits with status 1 if the file cannot be decoded as
+    text or parsed as json. Data that does not fit ``ScanCodeData`` is not
+    handled here; whatever ``model_validate`` raises propagates.
+    """
+    logging.info(f"Converting scancode to opossum {filename}")
+
+    try:
+        # Read as UTF-8 explicitly instead of the platform-dependent default.
+        with open(filename, encoding="utf-8") as inp:
+            json_data = json.load(inp)
+    except json.JSONDecodeError as jsde:
+        logging.error(f"Error decoding json for file {filename}. Message: {jsde.msg}")
+        sys.exit(1)
+    except UnicodeDecodeError:
+        logging.error(f"Error decoding json for file {filename}.")
+        sys.exit(1)
+
+    scancode_data = ScanCodeData.model_validate(json_data)
+    filetree = scancode_to_file_tree(scancode_data)
+    resources = convert_to_opossum_resources(filetree)
+    external_attributions, resources_to_attributions = create_attribution_mapping(
+        filetree
+    )
+
+    return OpossumInformation(
+        metadata=create_opossum_metadata(scancode_data),
+        resources=resources,
+        externalAttributions=external_attributions,
+        resourcesToAttributions=resources_to_attributions,
+        attributionBreakpoints=[],
+        externalAttributionSources={},
+    )
+
+
+def create_opossum_metadata(scancode_data: ScanCodeData) -> Metadata:
+    """Build the opossum metadata from the single header of the ScanCode data.
+
+    The header's ``end_timestamp`` becomes the file creation date, the project
+    id is a freshly generated random UUID and the title is a fixed string.
+
+    Logs an error and exits with status 1 unless exactly one header exists.
+    """
+    header_count = len(scancode_data.headers)
+    if header_count == 0:
+        logging.error("ScanCode data is missing the header!")
+        sys.exit(1)
+    elif header_count > 1:
+        logging.error(f"ScanCode data has {header_count} headers!")
+        sys.exit(1)
+
+    the_header = scancode_data.headers[0]
+
+    metadata = {
+        "projectId": str(uuid.uuid4()),
+        "fileCreationDate": the_header.end_timestamp,
+        "projectTitle": "ScanCode file",
+    }
+
+    return Metadata.model_validate(metadata)
diff --git a/src/opossum_lib/scancode/helpers.py b/src/opossum_lib/scancode/helpers.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+import os.path
+
+from pydantic import BaseModel
+from pydantic_core import SchemaValidator
+
+
+def path_segments(path: str) -> list[str]:
+    """Split a path into its components.
+
+    The path is normalized with ``os.path.normpath`` first and then split on
+    the platform separator ``os.sep``.
+    """
+    return os.path.normpath(path).split(os.sep)
+
+
+def check_schema(model: BaseModel) -> None:
+    """Validate the current attribute values of ``model`` against its schema.
+
+    A ``SchemaValidator`` is built from the model's pydantic core schema and
+    run on ``model.__dict__``. The validation result is discarded; any error
+    raised by the validator propagates to the caller.
+    """
+    core_schema = model.__pydantic_core_schema__
+    SchemaValidator(schema=core_schema).validate_python(model.__dict__)
diff --git a/src/opossum_lib/scancode/model.py b/src/opossum_lib/scancode/model.py
@@ -0,0 +1,174 @@
+# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel
+
+
+# Scan options recorded in a header. Only ``input`` is modeled; other keys
+# are explicitly ignored (``extra="ignore"``).
+class Options(BaseModel, extra="ignore"):
+    input: list[str]
+
+
+# Nested under ``ExtraData.system_environment``.
+class SystemEnvironment(BaseModel):
+    operating_system: str
+    cpu_architecture: str
+    platform: str
+    platform_version: str
+    python_version: str
+
+
+# Nested under ``Header.extra_data``.
+class ExtraData(BaseModel):
+    system_environment: SystemEnvironment
+    spdx_license_list_version: str
+    files_count: int
+
+
+# One entry of ``ScanCodeData.headers``.
+class Header(BaseModel):
+    tool_name: str
+    tool_version: str
+    options: Options
+    notice: str
+    start_timestamp: str
+    end_timestamp: str
+    output_format_version: str
+    duration: float
+    message: Any
+    errors: list
+    warnings: list
+    extra_data: ExtraData
+
+
+# Entry of ``LicenseDetection.reference_matches``. Declares the same fields
+# as ``Match`` below.
+class ReferenceMatch(BaseModel):
+    license_expression: str
+    license_expression_spdx: str
+    from_file: str
+    start_line: int
+    end_line: int
+    matcher: str
+    score: float
+    matched_length: int
+    match_coverage: float
+    rule_relevance: int
+    rule_identifier: str
+    rule_url: Any
+
+
+# Entry of the top-level ``ScanCodeData.license_detections`` list.
+class LicenseDetection(BaseModel):
+    identifier: str
+    license_expression: str
+    license_expression_spdx: str
+    detection_count: int
+    reference_matches: list[ReferenceMatch]
+
+
+# Entry of ``LicenseDetection1.matches``. Declares the same fields as
+# ``ReferenceMatch`` above.
+class Match(BaseModel):
+    license_expression: str
+    license_expression_spdx: str
+    from_file: str
+    start_line: int
+    end_line: int
+    matcher: str
+    score: float
+    matched_length: int
+    match_coverage: float
+    rule_relevance: int
+    rule_identifier: str
+    rule_url: Any
+
+
+# Per-file license detection, entry of ``File.license_detections``. Unlike
+# the top-level ``LicenseDetection`` it has ``matches`` and no
+# ``detection_count``.
+class LicenseDetection1(BaseModel):
+    license_expression: str
+    license_expression_spdx: str
+    matches: list[Match]
+    identifier: str
+
+
+# Entry of ``File.copyrights``.
+class Copyright(BaseModel):
+    copyright: str
+    start_line: int
+    end_line: int
+
+
+# Entry of ``File.holders``.
+class Holder(BaseModel):
+    holder: str
+    start_line: int
+    end_line: int
+
+
+# Entry of ``File.urls``.
+class Url(BaseModel):
+    url: str
+    start_line: int
+    end_line: int
+
+
+# Entry of ``ScanCodeData.files``. No field declares a default, so every key
+# must be present in the input; fields typed ``... | None`` additionally
+# accept ``null``.
+# NOTE(review): presumably ScanCode omits some of these keys depending on the
+# scan options used, which would make validation fail — confirm which option
+# sets this model is meant to accept.
+class File(BaseModel):
+    path: str
+    type: str
+    name: str
+    base_name: str
+    extension: str
+    size: int
+    date: str | None
+    sha1: str | None
+    md5: str | None
+    sha256: str | None
+    mime_type: str | None
+    file_type: str | None
+    programming_language: str | None
+    is_binary: bool
+    is_text: bool
+    is_archive: bool
+    is_media: bool
+    is_source: bool
+    is_script: bool
+    package_data: list
+    for_packages: list
+    detected_license_expression: str | None
+    detected_license_expression_spdx: str | None
+    license_detections: list[LicenseDetection1]
+    license_clues: list
+    percentage_of_license_text: float
+    copyrights: list[Copyright]
+    holders: list[Holder]
+    authors: list
+    emails: list
+    urls: list[Url]
+    files_count: int
+    dirs_count: int
+    size_count: int
+    scan_errors: list
+
+
+# Root model for a ScanCode json result.
+class ScanCodeData(BaseModel):
+    headers: list[Header]
+    packages: list
+    dependencies: list
+    license_detections: list[LicenseDetection]
+    files: list[File]