Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion database/QA/DB006_QA_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# === 1. File paths ===
base_dir = os.path.dirname(os.path.abspath(__file__))
input_file = os.path.join(base_dir, "../clean data/cleanSample.json") # cleaned dataset
input_file = os.path.join(base_dir, "../clean_data/cleanSample.json") # cleaned dataset
errors_file = os.path.join(base_dir, "errors.json")
summary_file = os.path.join(base_dir, "summary_report.txt")

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
# === Configuration constants ===
# Edit these paths as needed
# - Find Examples of Input and Output in IOExamples Folder
INPUT_FILE = "database/clean data/IOExamples/rawSample.jsonl"
OUTPUT_FILE = "database/clean data/cleanSample.json"
INPUT_FILE = "database/clean_data/IOExamples/rawSample.jsonl"
OUTPUT_FILE = "database/clean_data/cleanSample.json"

NUTRIENTS_TO_KEEP = {
# Energy
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion database/pipeline/stages/clean_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger.setLevel(logging.INFO)

# TODO: Remove sys.path workaround once NutrientUnitNormalisation is packaged as a proper module
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'clean data', 'normalization'))
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'clean_data', 'normalization'))

from NutrientUnitNormalisation import normalize_nutriments_dict

Expand Down
72 changes: 38 additions & 34 deletions mapping/map_enriched_to_product_detail.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,26 @@ def map_enriched_to_product_detail(product: Dict[str, Any]) -> Dict[str, Any]:
"""Map an enriched product record to ProductDetail V1 contract."""
out: Dict[str, Any] = {}

out["barcode"] = product.get("barcode")
out["brand"] = product.get("brand")
out["productName"] = product.get("productName")
out["genericName"] = product.get("genericName")
# Required fields
out["barcode"] = str(product.get("barcode") or "")
out["productName"] = str(product.get("productName") or "")

# Optional fields with correct types/defaults
out["brand"] = product.get("brand") if product.get("brand") is not None else None
out["genericName"] = product.get("genericName") if product.get("genericName") is not None else None
out["additives"] = _safe_list(product.get("additives"))
out["allergens"] = _safe_list(product.get("allergens"))
out["ingredients"] = _safe_list(product.get("ingredients"))
out["ingredientsText"] = product.get("ingredientsText")
# Normalize category data: remove language prefixes, deduplicate, filter empty values
out["ingredientsText"] = product.get("ingredientsText") if product.get("ingredientsText") is not None else None

# Categories
category_data = normalize_category_fields(product.get("categories"))
out["category"] = category_data["category"]
out["categories"] = category_data["categories"]
out["category"] = category_data.get("category") if category_data.get("category") is not None else None
out["categories"] = category_data.get("categories") if category_data.get("categories") is not None else []

out["labels"] = _safe_list(product.get("labels"))
out["nutrientLevels"] = product.get("nutrientLevels") or {}
out["nutriments"] = product.get("nutriments") or {}
out["nutrientLevels"] = dict(product.get("nutrientLevels") or {})
out["nutriments"] = dict(product.get("nutriments") or {})

# Normalise numeric nutriments using existing utility
try:
Expand All @@ -52,31 +55,32 @@ def map_enriched_to_product_detail(product: Dict[str, Any]) -> Dict[str, Any]:
norm = {}

out["nutriments_normalized"] = {
"energy_kj": norm.get("energy_kj"),
"energy_kcal": norm.get("energy_kcal"),
"fat_g": norm.get("fat_g"),
"saturated_fat_g": norm.get("saturated_fat_g"),
"carbohydrates_g": norm.get("carbohydrates_g"),
"sugars_g": norm.get("sugars_g"),
"proteins_g": norm.get("proteins_g"),
"salt_g": norm.get("salt_g"),
"sodium_mg": norm.get("sodium_mg"),
"fiber_g": norm.get("fiber_g"),
"energy_kj": norm.get("energy_kj", None),
"energy_kcal": norm.get("energy_kcal", None),
"fat_g": norm.get("fat_g", None),
"saturated_fat_g": norm.get("saturated_fat_g", None),
"carbohydrates_g": norm.get("carbohydrates_g", None),
"sugars_g": norm.get("sugars_g", None),
"proteins_g": norm.get("proteins_g", None),
"salt_g": norm.get("salt_g", None),
"sodium_mg": norm.get("sodium_mg", None),
"fiber_g": norm.get("fiber_g", None),
}

out["nutriscoreGrade"] = product.get("nutriscoreGrade")
out["productQuantity"] = product.get("productQuantity")
out["productQuantityUnit"] = product.get("productQuantityUnit")
out["servingQuantity"] = product.get("servingQuantity")
out["servingQuantityUnit"] = product.get("servingQuantityUnit")
out["traces"] = product.get("traces")
out["completeness"] = product.get("completeness")
out["nutriscoreGrade"] = product.get("nutriscoreGrade") if product.get("nutriscoreGrade") is not None else None
out["productQuantity"] = product.get("productQuantity") if product.get("productQuantity") is not None else None
out["productQuantityUnit"] = product.get("productQuantityUnit") if product.get("productQuantityUnit") is not None else None
out["servingQuantity"] = product.get("servingQuantity") if product.get("servingQuantity") is not None else None
out["servingQuantityUnit"] = product.get("servingQuantityUnit") if product.get("servingQuantityUnit") is not None else None
out["traces"] = product.get("traces") if product.get("traces") is not None else None
out["completeness"] = product.get("completeness") if product.get("completeness") is not None else None

# Images
images = product.get("images") or {}
out["images"] = {
"root": images.get("root") or "",
"primary": images.get("primary"),
"variants": images.get("variants") or {},
"root": str(images.get("root") or ""),
"primary": images.get("primary") if images.get("primary") is not None else None,
"variants": dict(images.get("variants") or {}),
}

# Tags: use resolver if tags present; otherwise empty lists
Expand All @@ -88,9 +92,9 @@ def map_enriched_to_product_detail(product: Dict[str, Any]) -> Dict[str, Any]:
else:
final = []
removed = []

out["tags"] = {"final": final, "removed": removed}

out["metadata"] = {"source": "local-enriched"}
# Metadata (always present, can be extended)
out["metadata"] = dict(product.get("metadata") or {"source": "local-enriched"})

return out
52 changes: 24 additions & 28 deletions scripts/reports/validation_report.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"barcode": "9300633714437",
"ok": true,
"errors": [],
"time_s": 0.0003
"time_s": 0.0011
},
{
"barcode": "9300633391645",
Expand All @@ -18,31 +18,31 @@
"barcode": "9300695008826",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "93552516",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "0062020000248",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "9338441010052",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "0089686170924",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "9342584072280",
Expand All @@ -54,20 +54,26 @@
"barcode": "5060195907145",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "9339423001075",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "9323966105178",
"ok": true,
"errors": [],
"time_s": 0.0
},
{
"barcode": "",
"ok": true,
"errors": [],
"time_s": 0.0
},
{
"barcode": "0009542005948",
"ok": true,
Expand All @@ -94,17 +100,15 @@
},
{
"barcode": "0011210681101",
"ok": false,
"errors": [
"productName must be string"
],
"ok": true,
"errors": [],
"time_s": 0.0
},
{
"barcode": "0011826800071",
"ok": true,
"errors": [],
"time_s": 0.0001
"time_s": 0.0
},
{
"barcode": "0012524702117",
Expand Down Expand Up @@ -182,14 +186,12 @@
"barcode": "0020176790284",
"ok": true,
"errors": [],
"time_s": 0.0
"time_s": 0.0001
},
{
"barcode": "0020662020154",
"ok": false,
"errors": [
"productName must be string"
],
"ok": true,
"errors": [],
"time_s": 0.0
},
{
Expand Down Expand Up @@ -299,19 +301,13 @@
"ok": true,
"errors": [],
"time_s": 0.0
},
{
"barcode": "0039047154391",
"ok": true,
"errors": [],
"time_s": 0.0
}
],
"summary": {
"total_items": 50,
"errors": 2,
"error_rate": 0.04,
"total_time_s": 0.0025,
"avg_time_s": 4.9e-05
"errors": 0,
"error_rate": 0.0,
"total_time_s": 0.0023,
"avg_time_s": 4.5e-05
}
}