Skip to content

Commit 5f92118

Browse files
committed
Add basic mine cran workflow
Signed-off-by: ziad hany <[email protected]>
1 parent e23ede7 commit 5f92118

File tree

5 files changed

+202
-0
lines changed

5 files changed

+202
-0
lines changed

minecode_pipelines/miners/cran.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
from pathlib import Path
10+
import requests
11+
12+
13+
def get_cran_db(output_file="cran_db.json", timeout=60) -> Path:
    """
    Download the CRAN package database (~250MB JSON) in a memory-efficient way.

    The HTTP response is streamed and written to ``output_file`` chunk by
    chunk instead of being loaded in memory as a whole.

    ``timeout`` is the number of seconds handed to ``requests.get``. Without
    it a server that stops responding would block the caller forever.

    Return the ``Path`` of the written file.
    Raise ``requests.HTTPError`` when the server answers with an error status.
    """
    url = "https://crandb.r-pkg.org/-/all"
    output_path = Path(output_file)

    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before creating the output file when the request was rejected.
        response.raise_for_status()
        with output_path.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return output_path
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import os
24+
from scanpipe.pipelines import Pipeline
25+
from scanpipe.pipes import federatedcode
26+
27+
from minecode_pipelines.miners.cran import get_cran_db
28+
from minecode_pipelines.pipes import cran
29+
30+
# URL of the Git repository cloned by the CRAN pipeline below, read from the
# environment; an empty string is used when the variable is not set.
FEDERATEDCODE_CRAN_GIT_URL = os.environ.get(
    "FEDERATEDCODE_CRAN_GIT_URL", ""
)
33+
34+
35+
class MineandPublishCRANPURLs(Pipeline):
    """
    Mine all packageURLs from a CRAN R index and publish them to a FederatedCode repo.

    The repository to publish to is taken from the module-level
    ``FEDERATEDCODE_CRAN_GIT_URL`` setting.
    """

    @classmethod
    def steps(cls):
        """Return the tuple of step methods that make up this pipeline."""
        return (
            cls.check_federatedcode_eligibility,
            cls.setup_federatedcode_cran,
            cls.mine_and_publish_cran_packageurls,
        )

    def check_federatedcode_eligibility(self):
        """
        Check that results can be pushed to FederatedCode by delegating to
        ``federatedcode.check_federatedcode_configured_and_available``.
        """
        federatedcode.check_federatedcode_configured_and_available()

    def setup_federatedcode_cran(self):
        """
        Clone the FederatedCode CRAN repository and download the CRAN DB JSON file.

        The clone is kept as ``self.fed_repo`` and the path of the downloaded
        file as ``self.db_path`` for the mining step.
        """
        self.fed_repo = federatedcode.clone_repository(repo_url=FEDERATEDCODE_CRAN_GIT_URL)
        # Called without arguments, so the file is written to the default
        # "cran_db.json" location of ``get_cran_db``.
        self.db_path = get_cran_db()

    def mine_and_publish_cran_packageurls(self):
        """
        Get cran packageURLs for all mined cran package names and publish
        them using the repository and DB file prepared by the setup step.
        """
        cran.mine_and_publish_cran_packageurls(fed_repo=self.fed_repo, db_path=self.db_path)

minecode_pipelines/pipes/cran.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import json
24+
from pathlib import Path
25+
from packageurl import PackageURL
26+
from aboutcode.hashid import get_package_purls_yml_file_path
27+
from minecode_pipelines.utils import git_stage_purls, commit_and_push_changes
28+
29+
30+
def mine_and_publish_cran_packageurls(fed_repo, db_path):
    """
    Stage the package URLs of every package listed in the CRAN DB JSON file
    at ``db_path`` in the ``fed_repo`` repository, then commit and push them.

    ``fed_repo`` is passed as-is to ``git_stage_purls`` and
    ``commit_and_push_changes``.
    """
    for purls in extract_cran_packages(db_path):
        # A package without any version yields an empty list: nothing to write.
        if not purls:
            continue

        # The purls file location is computed from the first purl only, and
        # all the purls of the package are written to that single file.
        first_purl = purls[0]
        purl_yaml_path = get_package_purls_yml_file_path(first_purl)
        git_stage_purls(purls, fed_repo, purl_yaml_path)

    # NOTE(review): assumed to run once after all packages are staged, not
    # once per package -- confirm against the original indentation.
    commit_and_push_changes(fed_repo)
40+
41+
42+
def extract_cran_packages(json_file_path: str):
    """
    Yield one list of cran package URL strings per package found in the CRAN
    DB JSON file at ``json_file_path``, with one purl per package version.

    ``json_file_path`` may be a string or a ``Path``. The file is expected to
    hold a JSON object mapping each package name to a mapping that may have a
    "versions" mapping keyed by version. A package with a missing, empty or
    null "versions" entry yields an empty list.

    This is a generator: nothing is read, and ``FileNotFoundError`` for a
    missing file is not raised, until iteration starts.
    """
    db_path = Path(json_file_path)
    if not db_path.exists():
        raise FileNotFoundError(f"File not found: {db_path}")

    # NOTE: json.load parses the whole document in memory at once.
    with open(db_path, encoding="utf-8") as f:
        data = json.load(f)

    for pkg_name, pkg_data in data.items():
        # "or {}" also covers an explicit JSON null, which .get() alone
        # would return as None.
        versions = pkg_data.get("versions") or {}
        yield [
            PackageURL(type="cran", name=pkg_name, version=version).to_string()
            for version in versions
        ]
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#

minecode_pipelines/utils.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,17 @@
99

1010
import os
1111
import tempfile
12+
import textwrap
13+
from pathlib import Path
1214

1315
from commoncode.fileutils import create_dir
16+
from minecode_pipelines.miners import write_data_to_file
17+
18+
# Settings read from the environment when this module is imported.
# Version interpolated in the "Tool" line of the commit message.
VERSION = os.environ.get("VERSION", "")
# Host interpolated in the "Reference" URL of the commit message.
PURLDB_ALLOWED_HOST = os.environ.get("FEDERATEDCODE_GIT_ALLOWED_HOST", "")
# Name and email interpolated in the "Signed-off-by" line of the commit message.
author_name = os.environ.get("FEDERATEDCODE_GIT_SERVICE_NAME", "")
author_email = os.environ.get("FEDERATEDCODE_GIT_SERVICE_EMAIL", "")
# Name of the Git remote that commits are pushed to.
remote_name = os.environ.get("FEDERATEDCODE_GIT_REMOTE_NAME", "origin")
1423

1524

1625
def system_temp_dir(temp_dir=os.getenv("MINECODE_TMP")):
@@ -49,3 +58,33 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
4958
temp_dir = get_temp_dir(dir_name)
5059
location = os.path.join(temp_dir, file_name)
5160
return location
61+
62+
63+
def git_stage_purls(purls, repo, purls_file):
    """
    Save ``purls`` in the ``purls_file`` of the ``repo`` checkout and add that
    file to the Git index.

    ``purls_file`` is resolved against ``repo.working_dir`` for writing and is
    staged and returned as a ``Path``.
    """
    staged_path = Path(purls_file)
    destination = Path(repo.working_dir).joinpath(staged_path)

    write_data_to_file(path=destination, data=purls)
    repo.index.add([staged_path])

    return staged_path
73+
74+
75+
def commit_and_push_changes(repo):
    """
    Commit staged changes to the local repository and push them
    to the remote on the current active branch.

    The commit message is built from the module-level ``VERSION``,
    ``PURLDB_ALLOWED_HOST``, ``author_name`` and ``author_email`` settings.
    The push goes to the ``remote_name`` remote with ``--no-verify``.
    """

    # The trailing backslash keeps the message from starting with an empty line.
    commit_message = f"""\
Add/Update list of available package versions
Tool: pkg:github/aboutcode-org/purldb@v{VERSION}
Reference: https://{PURLDB_ALLOWED_HOST}/
Signed-off-by: {author_name} <{author_email}>
"""

    # Despite its name, this is the branch currently checked out in ``repo``.
    default_branch = repo.active_branch.name
    # dedent strips any leading whitespace common to all the message lines.
    repo.index.commit(textwrap.dedent(commit_message))
    repo.git.push(remote_name, default_branch, "--no-verify")

0 commit comments

Comments
 (0)