Skip to content

feat: convert scan code to .opossum #174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c362dd6
feat: Dummy cli interface for scan code files
Hellgartner Jan 10, 2025
9b658aa
feat: extract opossum metadata from scancode json
abraemer Jan 10, 2025
acbbd90
fix: pydantic.Extra is deprecated use string literal instead
abraemer Jan 13, 2025
f13c6de
refactor: separate model validation from metadata creation
abraemer Jan 13, 2025
ecbb705
feat: convert scancode to opossum
abraemer Jan 13, 2025
7587ba9
test: fix test_cli_with_multiple_files on windows?
abraemer Jan 13, 2025
08a5d9d
test: some tests for the conversion[WIP]
abraemer Jan 13, 2025
6e9ca39
fix: prepend paths with "/" to account for root
abraemer Jan 14, 2025
bba92a7
test: ensure validity of the tree of Nodes
abraemer Jan 14, 2025
76e22fd
test: verify attribution mapping
abraemer Jan 14, 2025
771fa4a
feat: consider only best match when generating attributions
abraemer Jan 14, 2025
a22b073
feat: Merge branch 'main' into feat-convert-scan-code
abraemer Jan 14, 2025
9164d18
test: minor cleanup test_cli.py
abraemer Jan 15, 2025
93c4580
test: improve attribution mapping test by using deepcopy
abraemer Jan 15, 2025
f75399e
feat(scancode): include license name in key for attribution for bette…
abraemer Jan 15, 2025
5008333
test: E2E test for scancode with comparison against a reference
abraemer Jan 15, 2025
71539e4
test(scancode): create test get_attribution_info
abraemer Jan 15, 2025
1e6a75d
refactor(scancode): remove dependency on Resource and go directly to …
abraemer Jan 15, 2025
1415bc6
refactor: improve user-facing texts
abraemer Jan 15, 2025
51ee53d
refactor: address review comments
abraemer Jan 15, 2025
7d4996b
refactor: address further comments
abraemer Jan 15, 2025
52e305b
refactor: Rename the LicenseDetections to improve clarity
abraemer Jan 15, 2025
b1e6f6b
refactor: further improvements
abraemer Jan 16, 2025
5dd967f
Merge branch 'main' into feat-convert-scan-code
abraemer Jan 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions src/opossum_lib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from opossum_lib.opossum.file_generation import write_opossum_information_to_file
from opossum_lib.opossum.opossum_file import OpossumInformation
from opossum_lib.opossum.read_opossum_file import read_opossum_file
from opossum_lib.scancode.convert_scancode_to_opossum import convert_scancode_to_opossum
from opossum_lib.spdx.convert_to_opossum import convert_spdx_to_opossum_information


Expand All @@ -35,6 +36,12 @@ def opossum_file() -> None:
multiple=True,
type=click.Path(exists=True),
)
@click.option(
"--scan-code-json",
help="ScanCode json files used as input.",
multiple=True,
type=click.Path(exists=True),
)
@click.option(
"--outfile",
"-o",
Expand All @@ -43,16 +50,18 @@ def opossum_file() -> None:
help="The file path to write the generated opossum document to. "
'If appropriate, the extension ".opossum" will be appended.',
)
def generate(spdx: list[str], opossum: list[str], outfile: str) -> None:
def generate(
spdx: list[str], scan_code_json: list[str], opossum: list[str], outfile: str
) -> None:
"""
Generate an Opossum file from various other file formats.

\b
Currently supported input formats:
- SPDX
"""
validate_input_exit_on_error(spdx, opossum)
opossum_information = convert_after_valid_input(spdx, opossum)
validate_input_exit_on_error(spdx, scan_code_json, opossum)
opossum_information = convert_after_valid_input(spdx, scan_code_json, opossum)

if not outfile.endswith(".opossum"):
outfile += ".opossum"
Expand All @@ -63,8 +72,10 @@ def generate(spdx: list[str], opossum: list[str], outfile: str) -> None:
write_opossum_information_to_file(opossum_information, Path(outfile))


def validate_input_exit_on_error(spdx: list[str], opossum: list[str]) -> None:
total_number_of_files = len(spdx) + len(opossum)
def validate_input_exit_on_error(
spdx: list[str], scan_code_json: list[str], opossum: list[str]
) -> None:
total_number_of_files = len(spdx) + len(scan_code_json) + len(opossum)
if total_number_of_files == 0:
logging.warning("No input provided. Exiting.")
sys.exit(1)
Expand All @@ -74,11 +85,14 @@ def validate_input_exit_on_error(spdx: list[str], opossum: list[str]) -> None:


def convert_after_valid_input(
spdx: list[str], opossum_files: list[str]
spdx: list[str], scan_code_json: list[str], opossum_files: list[str]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not new, but inconsistent: here we use "opossum_files", while earlier the same variable is just called "opossum".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that this is inconsistent. I think I would prefer to name them all like format_files and make that choice consistently throughout cli.py.

) -> OpossumInformation:
if len(spdx) == 1:
the_spdx_file = spdx[0]
return convert_spdx_to_opossum_information(the_spdx_file)
elif len(scan_code_json) == 1:
the_scan_code_json = scan_code_json[0]
return convert_scancode_to_opossum(the_scan_code_json)
else:
opossum_input_file = opossum_files[0]
return read_opossum_file(opossum_input_file)
Expand Down
Empty file.
66 changes: 66 additions & 0 deletions src/opossum_lib/scancode/convert_scancode_to_opossum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
#
# SPDX-License-Identifier: Apache-2.0


import json
import logging
import sys
import uuid

from opossum_lib.opossum.opossum_file import (
Metadata,
OpossumInformation,
)
from opossum_lib.scancode.model import ScanCodeData
from opossum_lib.scancode.resource_tree import (
convert_to_opossum_resources,
create_attribution_mapping,
scancode_to_file_tree,
)


def convert_scancode_to_opossum(filename: str) -> OpossumInformation:
    """Convert a ScanCode JSON result file into an ``OpossumInformation``.

    The file is parsed as JSON and validated against ``ScanCodeData``. A file
    tree is built from the validated data, and the Opossum resources and the
    attribution mapping are derived from that tree.

    Args:
        filename: Path of the ScanCode JSON file to read.

    Returns:
        The Opossum representation of the scan. ``attributionBreakpoints``
        and ``externalAttributionSources`` are always left empty.

    Note:
        If the file cannot be decoded as text or does not contain valid JSON,
        an error is logged and the process terminates via ``sys.exit(1)``.
    """
    logging.info(f"Converting scancode to opossum {filename}")

    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding (e.g. cp1252 on Windows).
        with open(filename, encoding="utf-8") as inp:
            json_data = json.load(inp)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding json for file {filename}. Message: {e.msg}")
        sys.exit(1)
    except UnicodeDecodeError:
        logging.error(f"Error decoding json for file {filename}.")
        sys.exit(1)

    scancode_data = ScanCodeData.model_validate(json_data)
    file_tree = scancode_to_file_tree(scancode_data)
    resources = convert_to_opossum_resources(file_tree)
    external_attributions, resources_to_attributions = create_attribution_mapping(
        file_tree
    )

    return OpossumInformation(
        metadata=create_opossum_metadata(scancode_data),
        resources=resources,
        externalAttributions=external_attributions,
        resourcesToAttributions=resources_to_attributions,
        attributionBreakpoints=[],
        externalAttributionSources={},
    )


def create_opossum_metadata(scancode_data: ScanCodeData) -> Metadata:
    """Build the Opossum ``Metadata`` for a validated ScanCode document.

    Exactly one header is expected in ``scancode_data.headers``. If there is
    none, or more than one, an error is logged and the process terminates
    via ``sys.exit(1)``.

    Args:
        scancode_data: The validated ScanCode data.

    Returns:
        Metadata with a freshly generated random ``projectId``, the header's
        ``end_timestamp`` as ``fileCreationDate`` and the fixed title
        ``"ScanCode file"``.
    """
    header_count = len(scancode_data.headers)
    if header_count == 0:
        logging.error("ScanCode data is missing the header!")
        sys.exit(1)
    if header_count > 1:
        logging.error(f"ScanCode data has {header_count} headers!")
        sys.exit(1)

    only_header = scancode_data.headers[0]
    return Metadata.model_validate(
        {
            "projectId": str(uuid.uuid4()),
            "fileCreationDate": only_header.end_timestamp,
            "projectTitle": "ScanCode file",
        }
    )
19 changes: 19 additions & 0 deletions src/opossum_lib/scancode/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
#
# SPDX-License-Identifier: Apache-2.0


import os.path

from pydantic import BaseModel
from pydantic_core import SchemaValidator


def path_segments(path: str) -> list[str]:
    """Return the components of *path* after normalising it.

    The path is first passed through ``os.path.normpath`` and the result is
    then split on ``os.sep``.
    """
    normalized = os.path.normpath(path)
    segments = normalized.split(os.sep)
    return segments


def check_schema(model: BaseModel) -> None:
    """Run *model*'s current attribute values through its own core schema.

    A ``SchemaValidator`` is built from ``model.__pydantic_core_schema__`` and
    applied to ``model.__dict__``. Nothing is returned; presumably the
    validator raises when the values do not satisfy the schema.
    """
    core_schema = model.__pydantic_core_schema__
    SchemaValidator(schema=core_schema).validate_python(model.__dict__)
151 changes: 151 additions & 0 deletions src/opossum_lib/scancode/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
#
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import Any

from pydantic import BaseModel


class Options(BaseModel, extra="ignore"):
    """The ``options`` entry of a ScanCode ``Header``.

    Only ``input`` is modelled. Because of ``extra="ignore"``, any other keys
    in the options object are ignored during validation rather than rejected.
    """

    # presumably the paths that were handed to ScanCode as scan input
    input: list[str]


class SystemEnvironment(BaseModel):
    """The ``system_environment`` entry of ``ExtraData``.

    All values are kept as plain strings; no format is enforced.
    """

    operating_system: str
    cpu_architecture: str
    platform: str
    platform_version: str
    python_version: str


class ExtraData(BaseModel):
    """The ``extra_data`` entry of a ScanCode ``Header``."""

    system_environment: SystemEnvironment
    spdx_license_list_version: str
    # presumably the number of files covered by the scan
    files_count: int


class Header(BaseModel):
    """One entry of ``ScanCodeData.headers``."""

    tool_name: str
    tool_version: str
    options: Options
    notice: str
    # Timestamps are kept as plain strings; they are not parsed into datetimes.
    start_timestamp: str
    end_timestamp: str
    output_format_version: str
    # presumably the scan duration in seconds
    duration: float
    # Not constrained to a particular type.
    message: Any
    # Element types of the following two lists are not constrained.
    errors: list
    warnings: list
    extra_data: ExtraData


class ReferenceMatch(BaseModel):
    """One entry of ``LicenseDetection.reference_matches``.

    Declares the same fields, in the same order, as ``Match``.
    """

    license_expression: str
    license_expression_spdx: str
    from_file: str
    start_line: int
    end_line: int
    matcher: str
    score: float
    matched_length: int
    match_coverage: float
    rule_relevance: int
    rule_identifier: str
    # Not constrained to a particular type.
    rule_url: Any


class LicenseDetection(BaseModel):
    """One entry of the top-level ``ScanCodeData.license_detections`` list.

    The per-file counterpart stored in ``File.license_detections`` is
    ``LicenseDetection1``.
    """

    identifier: str
    license_expression: str
    license_expression_spdx: str
    # presumably how often this detection occurred across the scan
    detection_count: int
    reference_matches: list[ReferenceMatch]


class Match(BaseModel):
    """One entry of ``LicenseDetection1.matches``.

    Declares the same fields, in the same order, as ``ReferenceMatch``.
    """

    license_expression: str
    license_expression_spdx: str
    from_file: str
    start_line: int
    end_line: int
    matcher: str
    score: float
    matched_length: int
    match_coverage: float
    rule_relevance: int
    rule_identifier: str
    # Not constrained to a particular type.
    rule_url: Any


class LicenseDetection1(BaseModel):
    """One entry of ``File.license_detections``.

    Unlike the top-level ``LicenseDetection`` it holds its ``matches``
    directly and declares no ``detection_count``.
    """

    license_expression: str
    license_expression_spdx: str
    matches: list[Match]
    identifier: str


class Copyright(BaseModel):
    """One entry of ``File.copyrights``."""

    copyright: str
    # presumably the line range in the file where the statement was found
    start_line: int
    end_line: int


class Holder(BaseModel):
    """One entry of ``File.holders``."""

    holder: str
    # presumably the line range in the file where the holder was found
    start_line: int
    end_line: int


class Url(BaseModel):
    """One entry of ``File.urls``."""

    url: str
    # presumably the line range in the file where the URL was found
    start_line: int
    end_line: int


class File(BaseModel):
    """One entry of ``ScanCodeData.files``.

    Fields annotated ``... | None`` declare no default value, so they accept
    ``None`` but the key itself still has to be present in the input.
    """

    path: str
    # presumably distinguishes files from directories — TODO confirm values
    type: str
    name: str
    base_name: str
    extension: str
    size: int
    date: str | None
    sha1: str | None
    md5: str | None
    sha256: str | None
    mime_type: str | None
    file_type: str | None
    programming_language: str | None
    is_binary: bool
    is_text: bool
    is_archive: bool
    is_media: bool
    is_source: bool
    is_script: bool
    # Element types of the following two lists are not constrained.
    package_data: list
    for_packages: list
    detected_license_expression: str | None
    detected_license_expression_spdx: str | None
    license_detections: list[LicenseDetection1]
    license_clues: list
    percentage_of_license_text: float
    copyrights: list[Copyright]
    holders: list[Holder]
    authors: list
    emails: list
    urls: list[Url]
    # presumably aggregate counts/sizes for directory entries — TODO confirm
    files_count: int
    dirs_count: int
    size_count: int
    scan_errors: list


class ScanCodeData(BaseModel):
    """Top-level model of a ScanCode JSON document.

    All five keys are required; ``packages`` and ``dependencies`` are accepted
    as lists without any constraint on their elements.
    """

    headers: list[Header]
    packages: list
    dependencies: list
    license_detections: list[LicenseDetection]
    files: list[File]
Loading
Loading