Skip to content

Commit 3169feb

Browse files
authored
Doug/add dedupe logic to sdk (#33)
* Added de-dupe logic for non types results * Version bump * Fix path for module
1 parent f05bd67 commit 3169feb

File tree

4 files changed

+123
-6
lines changed

4 files changed

+123
-6
lines changed

socketdev/core/dedupe.py

+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import logging
from collections import defaultdict
from typing import Dict, List, Any
3+
4+
5+
class Dedupe:
    """Group package records and merge each group into a single record.

    Records are plain dictionaries. Each group is collapsed into one record
    whose ``alerts`` list holds one entry per distinct alert and whose
    ``releases`` list names every release value seen in the group.
    """

    @staticmethod
    def normalize_file_path(path: str) -> str:
        """Return *path* with its first ``/``-separated component removed.

        A path without ``/`` is returned unchanged. Any falsy value
        (``None`` or ``""``) yields ``""``.
        """
        if not path:
            return ""
        if "/" not in path:
            return path
        return path.split("/", 1)[-1]

    @staticmethod
    def alert_key(alert: dict) -> tuple:
        """Return the tuple that identifies *alert* for de-duplication.

        Two alerts are treated as the same alert when their type, severity,
        category, normalized file path, start and end all match.

        Raises:
            KeyError: if ``type``, ``severity`` or ``category`` is missing.
        """
        return (
            alert["type"],
            alert["severity"],
            alert["category"],
            Dedupe.normalize_file_path(alert.get("file")),
            alert.get("start"),
            alert.get("end"),
        )

    @staticmethod
    def consolidate_and_merge_alerts(package_group: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Merge a group of package records into one consolidated record.

        Package-level fields are copied from the first record of the group.
        Alerts with the same ``alert_key`` are merged into one entry that
        keeps the first ``key`` seen and lists, in order of appearance, each
        release it was found in.

        Args:
            package_group: non-empty list of package dictionaries.

        Returns:
            The merged package dictionary.

        Raises:
            IndexError: if *package_group* is empty.
            KeyError: if an alert lacks ``key``, ``type``, ``severity`` or
                ``category``.
        """
        alert_map: Dict[tuple, dict] = {}
        releases = set()

        for pkg in package_group:
            # Fall back to the package type when no release is given.
            release = pkg.get("release")
            if release is None:
                release = pkg.get("type")
            releases.add(release)

            # An "alerts" key holding None is treated like a missing key.
            for alert in pkg.get("alerts") or []:
                identity = Dedupe.alert_key(alert)
                merged = alert_map.get(identity)
                if merged is None:
                    alert_type, severity, category, file_path, start, end = identity
                    alert_map[identity] = {
                        "key": alert["key"],  # keep the first key seen
                        "type": alert_type,
                        "severity": severity,
                        "category": category,
                        "file": file_path,
                        "start": start,
                        "end": end,
                        "releases": [release],
                    }
                elif release not in merged["releases"]:
                    merged["releases"].append(release)

        base = package_group[0]
        return {
            "id": base.get("id"),
            "author": base.get("author"),
            "size": base.get("size"),
            "type": base.get("type"),
            "name": base.get("name"),
            "namespace": base.get("namespace"),
            "version": base.get("version"),
            "releases": sorted(releases),
            "alerts": list(alert_map.values()),
            "score": base.get("score", {}),
            "license": base.get("license"),
            "licenseDetails": base.get("licenseDetails", []),
            "batchIndex": base.get("batchIndex"),
            # NOTE(review): unlike the purl built in consolidate_by_order,
            # this one omits the namespace. Kept as-is for compatibility.
            "purl": f"pkg:{base.get('type', 'unknown')}/{base.get('name', 'unknown')}@{base.get('version', '0.0.0')}"
        }

    @staticmethod
    def dedupe(packages: List[Dict[str, Any]], batched: bool = True) -> List[Dict[str, Any]]:
        """Group *packages* and return one merged record per group.

        Args:
            packages: package dictionaries to de-duplicate.
            batched: when true, group by each record's existing
                ``batchIndex``; otherwise group consecutive records that
                describe the same package (see ``consolidate_by_order``).

        Returns:
            Merged package dictionaries, in order of first appearance of
            each group.
        """
        if batched:
            grouped = Dedupe.consolidate_by_batch_index(packages)
        else:
            grouped = Dedupe.consolidate_by_order(packages)
        return [Dedupe.consolidate_and_merge_alerts(group) for group in grouped.values()]

    @staticmethod
    def consolidate_by_batch_index(packages: List[Dict[str, Any]]) -> Dict[int, List[Dict[str, Any]]]:
        """Group *packages* by their ``batchIndex`` value.

        Raises:
            KeyError: if a package has no ``batchIndex``.
        """
        grouped: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
        for pkg in packages:
            grouped[pkg["batchIndex"]].append(pkg)
        return grouped

    @staticmethod
    def consolidate_by_order(packages: List[Dict[str, Any]]) -> Dict[int, List[Dict[str, Any]]]:
        """Group consecutive packages that share the same purl.

        A purl of the form ``pkg:<type>/[<namespace>/]<name>@<version>`` is
        built for every package. A new group starts each time the purl
        differs from the previous package's purl. Every grouped package is
        mutated in place: its ``batchIndex`` is set to its group index.

        Packages that cannot be identified (not a mapping, or missing
        ``name``/``version``) are skipped with a warning on the ``socketdev``
        logger and do not affect the grouping of the remaining packages.

        Returns:
            Mapping of group index to the packages in that group.
        """
        grouped: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
        batch_index = 0
        previous_purl = None
        for pkg in packages:
            try:
                name = pkg["name"]
                version = pkg["version"]
                namespace = pkg.get("namespace")
                ecosystem = pkg.get("type")
            except (KeyError, TypeError, AttributeError) as error:
                logging.getLogger("socketdev").warning(
                    "Skipping package that cannot be grouped for dedupe: %r", error
                )
                continue

            current_purl = f"pkg:{ecosystem}/"
            if namespace:
                current_purl += f"{namespace}/"
            current_purl += f"{name}@{version}"

            if previous_purl is not None and current_purl != previous_purl:
                batch_index += 1
            # Always track the most recent purl so each run of identical
            # packages is compared with its neighbour, not with the very
            # first package of the stream.
            previous_purl = current_purl

            pkg["batchIndex"] = batch_index
            grouped[batch_index].append(pkg)
        return grouped

socketdev/fullscans/__init__.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import Any, Dict, List, Optional, Union
55
from dataclasses import dataclass, asdict, field
66
import urllib.parse
7-
7+
from ..core.dedupe import Dedupe
88
from ..utils import IntegrationType, Utils
99

1010
log = logging.getLogger("socketdev")
@@ -712,6 +712,7 @@ def get(self, org_slug: str, params: dict, use_types: bool = False) -> Union[dic
712712
result = response.json()
713713
if use_types:
714714
return GetFullScanMetadataResponse.from_dict({"success": True, "status": 200, "data": result})
715+
715716
return result
716717

717718
error_message = response.json().get("error", {}).get("message", "Unknown error")
@@ -803,9 +804,9 @@ def stream(self, org_slug: str, full_scan_id: str, use_types: bool = False) -> U
803804
if line != '"' and line != "" and line is not None:
804805
item = json.loads(line)
805806
stream_str.append(item)
806-
for val in stream_str:
807-
artifacts[val["id"]] = val
808-
807+
stream_deduped = Dedupe.dedupe(stream_str, batched=False)
808+
for batch in stream_deduped:
809+
artifacts[batch["id"]] = batch
809810
if use_types:
810811
return FullScanStreamResponse.from_dict({"success": True, "status": 200, "artifacts": artifacts})
811812
return artifacts

socketdev/purl/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import urllib.parse
33
from socketdev.log import log
4+
from ..core.dedupe import Dedupe
45

56

67
class Purl:
@@ -32,7 +33,8 @@ def post(self, license: str = "false", components: list = None, **kwargs) -> lis
3233
purl.append(item)
3334
except json.JSONDecodeError:
3435
continue
35-
return purl
36+
purl_deduped = Dedupe.dedupe(purl)
37+
return purl_deduped
3638

3739
log.error(f"Error posting {components} to the Purl API: {response.status_code}")
3840
print(response.text)

socketdev/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "2.0.16"
1+
__version__ = "2.0.20"

0 commit comments

Comments
 (0)