From bc4d2c5c6bc9e1243cf444335c3876442f1855ca Mon Sep 17 00:00:00 2001 From: Aarya1801 Date: Mon, 30 Mar 2026 15:30:10 +1100 Subject: [PATCH] BE001: Ensure Product Detail API response matches frontend contract and passes schema validation --- database/QA/DB006_QA_cleaning.py | 2 +- .../IOExamples/cleanSample.json | 0 .../IOExamples/rawSample.jsonl | 0 .../{clean data => clean_data}/__init__.py | 0 .../cleanProductData.py | 4 +- .../cleanSample.json | 0 .../{clean data => clean_data}/constants.py | 0 .../normalization/CategoryHarmonisation.py | 0 .../IngredientStandardisation.py | 0 .../NutrientUnitNormalisation.py | 0 database/pipeline/stages/clean_stage.py | 2 +- mapping/map_enriched_to_product_detail.py | 72 ++++++++++--------- scripts/reports/validation_report.json | 52 +++++++------- 13 files changed, 66 insertions(+), 66 deletions(-) rename database/{clean data => clean_data}/IOExamples/cleanSample.json (100%) rename database/{clean data => clean_data}/IOExamples/rawSample.jsonl (100%) rename database/{clean data => clean_data}/__init__.py (100%) rename database/{clean data => clean_data}/cleanProductData.py (99%) rename database/{clean data => clean_data}/cleanSample.json (100%) rename database/{clean data => clean_data}/constants.py (100%) rename database/{clean data => clean_data}/normalization/CategoryHarmonisation.py (100%) rename database/{clean data => clean_data}/normalization/IngredientStandardisation.py (100%) rename database/{clean data => clean_data}/normalization/NutrientUnitNormalisation.py (100%) diff --git a/database/QA/DB006_QA_cleaning.py b/database/QA/DB006_QA_cleaning.py index b3c095e..05f12b0 100644 --- a/database/QA/DB006_QA_cleaning.py +++ b/database/QA/DB006_QA_cleaning.py @@ -5,7 +5,7 @@ # === 1. File paths === base_dir = os.path.dirname(os.path.abspath(__file__)) -input_file = os.path.join(base_dir, "../clean data/cleanSample.json") # cleaned dataset +input_file = os.path.join(base_dir, "../clean_data/cleanSample.json") # cleaned dataset errors_file = os.path.join(base_dir, "errors.json") summary_file = os.path.join(base_dir, "summary_report.txt") diff --git a/database/clean data/IOExamples/cleanSample.json b/database/clean_data/IOExamples/cleanSample.json similarity index 100% rename from database/clean data/IOExamples/cleanSample.json rename to database/clean_data/IOExamples/cleanSample.json diff --git a/database/clean data/IOExamples/rawSample.jsonl b/database/clean_data/IOExamples/rawSample.jsonl similarity index 100% rename from database/clean data/IOExamples/rawSample.jsonl rename to database/clean_data/IOExamples/rawSample.jsonl diff --git a/database/clean data/__init__.py b/database/clean_data/__init__.py similarity index 100% rename from database/clean data/__init__.py rename to database/clean_data/__init__.py diff --git a/database/clean data/cleanProductData.py b/database/clean_data/cleanProductData.py similarity index 99% rename from database/clean data/cleanProductData.py rename to database/clean_data/cleanProductData.py index 43c2635..6902634 100644 --- a/database/clean data/cleanProductData.py +++ b/database/clean_data/cleanProductData.py @@ -18,8 +18,8 @@ # === Configuration constants === # Edit these paths as needed # - Find Examples of Input and Output in IOExamples Folder -INPUT_FILE = "database/clean data/IOExamples/rawSample.jsonl" -OUTPUT_FILE = "database/clean data/cleanSample.json" +INPUT_FILE = "database/clean_data/IOExamples/rawSample.jsonl" +OUTPUT_FILE = "database/clean_data/cleanSample.json" NUTRIENTS_TO_KEEP = { # Energy diff --git a/database/clean data/cleanSample.json b/database/clean_data/cleanSample.json similarity index 100% rename from database/clean data/cleanSample.json rename to database/clean_data/cleanSample.json diff --git a/database/clean data/constants.py b/database/clean_data/constants.py similarity index 100% rename from database/clean data/constants.py rename to database/clean_data/constants.py diff --git a/database/clean data/normalization/CategoryHarmonisation.py b/database/clean_data/normalization/CategoryHarmonisation.py similarity index 100% rename from database/clean data/normalization/CategoryHarmonisation.py rename to database/clean_data/normalization/CategoryHarmonisation.py diff --git a/database/clean data/normalization/IngredientStandardisation.py b/database/clean_data/normalization/IngredientStandardisation.py similarity index 100% rename from database/clean data/normalization/IngredientStandardisation.py rename to database/clean_data/normalization/IngredientStandardisation.py diff --git a/database/clean data/normalization/NutrientUnitNormalisation.py b/database/clean_data/normalization/NutrientUnitNormalisation.py similarity index 100% rename from database/clean data/normalization/NutrientUnitNormalisation.py rename to database/clean_data/normalization/NutrientUnitNormalisation.py diff --git a/database/pipeline/stages/clean_stage.py b/database/pipeline/stages/clean_stage.py index e61b93d..2364bfd 100644 --- a/database/pipeline/stages/clean_stage.py +++ b/database/pipeline/stages/clean_stage.py @@ -12,7 +12,7 @@ logger.setLevel(logging.INFO) # TODO: Remove sys.path workaround once NutrientUnitNormalisation is packaged as a proper module -sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'clean data', 'normalization')) +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'clean_data', 'normalization')) from NutrientUnitNormalisation import normalize_nutriments_dict diff --git a/mapping/map_enriched_to_product_detail.py b/mapping/map_enriched_to_product_detail.py index f00afd5..a1cce58 100644 --- a/mapping/map_enriched_to_product_detail.py +++ b/mapping/map_enriched_to_product_detail.py @@ -26,23 +26,26 @@ def map_enriched_to_product_detail(product: Dict[str, Any]) -> Dict[str, Any]: """Map an enriched product record to ProductDetail V1 contract.""" out: Dict[str, Any] = {} - out["barcode"] = product.get("barcode") - out["brand"] = product.get("brand") - out["productName"] = product.get("productName") - out["genericName"] = product.get("genericName") + # Required fields + out["barcode"] = str(product.get("barcode") or "") + out["productName"] = str(product.get("productName") or "") + + # Optional fields with correct types/defaults + out["brand"] = product.get("brand") if product.get("brand") is not None else None + out["genericName"] = product.get("genericName") if product.get("genericName") is not None else None out["additives"] = _safe_list(product.get("additives")) out["allergens"] = _safe_list(product.get("allergens")) out["ingredients"] = _safe_list(product.get("ingredients")) - out["ingredientsText"] = product.get("ingredientsText") - - # Normalize category data: remove language prefixes, deduplicate, filter empty values + out["ingredientsText"] = product.get("ingredientsText") if product.get("ingredientsText") is not None else None + + # Categories category_data = normalize_category_fields(product.get("categories")) - out["category"] = category_data["category"] - out["categories"] = category_data["categories"] - + out["category"] = category_data.get("category") if category_data.get("category") is not None else None + out["categories"] = category_data.get("categories") if category_data.get("categories") is not None else [] + out["labels"] = _safe_list(product.get("labels")) - out["nutrientLevels"] = product.get("nutrientLevels") or {} - out["nutriments"] = product.get("nutriments") or {} + out["nutrientLevels"] = dict(product.get("nutrientLevels") or {}) + out["nutriments"] = dict(product.get("nutriments") or {}) # Normalise numeric nutriments using existing utility try: @@ -52,31 +55,32 @@ def map_enriched_to_product_detail(product: Dict[str, Any]) -> Dict[str, Any]: norm = {} out["nutriments_normalized"] = { - "energy_kj": norm.get("energy_kj"), - "energy_kcal": norm.get("energy_kcal"), - "fat_g": norm.get("fat_g"), - "saturated_fat_g": norm.get("saturated_fat_g"), - "carbohydrates_g": norm.get("carbohydrates_g"), - "sugars_g": norm.get("sugars_g"), - "proteins_g": norm.get("proteins_g"), - "salt_g": norm.get("salt_g"), - "sodium_mg": norm.get("sodium_mg"), - "fiber_g": norm.get("fiber_g"), + "energy_kj": norm.get("energy_kj", None), + "energy_kcal": norm.get("energy_kcal", None), + "fat_g": norm.get("fat_g", None), + "saturated_fat_g": norm.get("saturated_fat_g", None), + "carbohydrates_g": norm.get("carbohydrates_g", None), + "sugars_g": norm.get("sugars_g", None), + "proteins_g": norm.get("proteins_g", None), + "salt_g": norm.get("salt_g", None), + "sodium_mg": norm.get("sodium_mg", None), + "fiber_g": norm.get("fiber_g", None), } - out["nutriscoreGrade"] = product.get("nutriscoreGrade") - out["productQuantity"] = product.get("productQuantity") - out["productQuantityUnit"] = product.get("productQuantityUnit") - out["servingQuantity"] = product.get("servingQuantity") - out["servingQuantityUnit"] = product.get("servingQuantityUnit") - out["traces"] = product.get("traces") - out["completeness"] = product.get("completeness") + out["nutriscoreGrade"] = product.get("nutriscoreGrade") if product.get("nutriscoreGrade") is not None else None + out["productQuantity"] = product.get("productQuantity") if product.get("productQuantity") is not None else None + out["productQuantityUnit"] = product.get("productQuantityUnit") if product.get("productQuantityUnit") is not None else None + out["servingQuantity"] = product.get("servingQuantity") if product.get("servingQuantity") is not None else None + out["servingQuantityUnit"] = product.get("servingQuantityUnit") if product.get("servingQuantityUnit") is not None else None + out["traces"] = product.get("traces") if product.get("traces") is not None else None + out["completeness"] = product.get("completeness") if product.get("completeness") is not None else None + # Images images = product.get("images") or {} out["images"] = { - "root": images.get("root") or "", - "primary": images.get("primary"), - "variants": images.get("variants") or {}, + "root": str(images.get("root") or ""), + "primary": images.get("primary") if images.get("primary") is not None else None, + "variants": dict(images.get("variants") or {}), } # Tags: use resolver if tags present; otherwise empty lists @@ -88,9 +92,9 @@ def map_enriched_to_product_detail(product: Dict[str, Any]) -> Dict[str, Any]: else: final = [] removed = [] - out["tags"] = {"final": final, "removed": removed} - out["metadata"] = {"source": "local-enriched"} + # Metadata (always present, can be extended) + out["metadata"] = dict(product.get("metadata") or {"source": "local-enriched"}) return out diff --git a/scripts/reports/validation_report.json b/scripts/reports/validation_report.json index e37f29b..1b2d04f 100644 --- a/scripts/reports/validation_report.json +++ b/scripts/reports/validation_report.json @@ -6,7 +6,7 @@ "barcode": "9300633714437", "ok": true, "errors": [], - "time_s": 0.0003 + "time_s": 0.0011 }, { "barcode": "9300633391645", @@ -18,31 +18,31 @@ "barcode": "9300695008826", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "93552516", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "0062020000248", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "9338441010052", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "0089686170924", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "9342584072280", @@ -54,13 +54,13 @@ "barcode": "5060195907145", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "9339423001075", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "9323966105178", @@ -68,6 +68,12 @@ "errors": [], "time_s": 0.0 }, + { + "barcode": "", + "ok": true, + "errors": [], + "time_s": 0.0 + }, { "barcode": "0009542005948", "ok": true, @@ -94,17 +100,15 @@ }, { "barcode": "0011210681101", - "ok": false, - "errors": [ - "productName must be string" - ], + "ok": true, + "errors": [], "time_s": 0.0 }, { "barcode": "0011826800071", "ok": true, "errors": [], - "time_s": 0.0001 + "time_s": 0.0 }, { "barcode": "0012524702117", @@ -182,14 +186,12 @@ "barcode": "0020176790284", "ok": true, "errors": [], - "time_s": 0.0 + "time_s": 0.0001 }, { "barcode": "0020662020154", - "ok": false, - "errors": [ - "productName must be string" - ], + "ok": true, + "errors": [], "time_s": 0.0 }, { @@ -299,19 +301,13 @@ "ok": true, "errors": [], "time_s": 0.0 - }, - { - "barcode": "0039047154391", - "ok": true, - "errors": [], - "time_s": 0.0 } ], "summary": { "total_items": 50, - "errors": 2, - "error_rate": 0.04, - "total_time_s": 0.0025, - "avg_time_s": 4.9e-05 + "errors": 0, + "error_rate": 0.0, + "total_time_s": 0.0023, + "avg_time_s": 4.5e-05 } } \ No newline at end of file