diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.json b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.json index fb07441..86fdcfb 100644 --- a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.json +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.json @@ -7,11 +7,12 @@ "field_order": [ "enabled", "column_break_title", - "section_break_file", - "file", - "file_type", - "column_break_gbap", + "files_section", + "files", + "is_multiple_files", + "section_break_txfs", "transaction_type", + "column_break_xusg", "country", "section_break_sobg", "company", @@ -59,20 +60,12 @@ "label": "Enabled" }, { - "fieldname": "section_break_file", - "fieldtype": "Section Break" - }, - { - "fieldname": "file", - "fieldtype": "Attach", - "label": "File", - "print_hide": 1, - "reqd": 1 - }, - { - "fieldname": "file_type", - "fieldtype": "Data", - "label": "File Type", + "allow_on_submit": 1, + "default": "0", + "fieldname": "is_multiple_files", + "fieldtype": "Check", + "in_standard_filter": 1, + "label": "Multiple Files", "read_only": 1 }, { @@ -91,10 +84,6 @@ "options": "India\nOther", "reqd": 1 }, - { - "fieldname": "column_break_gbap", - "fieldtype": "Column Break" - }, { "fieldname": "company", "fieldtype": "Link", @@ -115,6 +104,19 @@ "label": "Party", "options": "party_type" }, + { + "fieldname": "files_section", + "fieldtype": "Section Break", + "label": "Files" + }, + { + "allow_on_submit": 1, + "fieldname": "files", + "fieldtype": "Table", + "label": "Files", + "options": "Parser Benchmark Dataset File", + "reqd": 1 + }, { "fieldname": "processing_section", "fieldtype": "Section Break", @@ -181,7 +183,6 @@ "label": "Google Gemini Flash-2.5" }, { - "depends_on": "eval: doc.file_type === \"PDF\"", "fieldname": "pdf_processor_section", "fieldtype": "Section Break", "label": "PDF 
Processors" @@ -273,6 +274,15 @@ { "fieldname": "column_break_aoce", "fieldtype": "Column Break" + }, + { + "fieldname": "section_break_txfs", + "fieldtype": "Section Break", + "label": "Transaction Details" + }, + { + "fieldname": "column_break_xusg", + "fieldtype": "Column Break" } ], "index_web_pages_for_search": 1, @@ -283,7 +293,7 @@ "link_fieldname": "dataset" } ], - "modified": "2026-03-27 10:32:46.620190", + "modified": "2026-03-30 15:01:03.977940", "modified_by": "Administrator", "module": "Parser Benchmark", "name": "Parser Benchmark Dataset", diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.py b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.py index 861a245..26feffa 100644 --- a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.py +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset/parser_benchmark_dataset.py @@ -34,6 +34,9 @@ class ParserBenchmarkDataset(Document): if TYPE_CHECKING: from frappe.types import DF + from transaction_parser.parser_benchmark.doctype.parser_benchmark_dataset_file.parser_benchmark_dataset_file import ( + ParserBenchmarkDatasetFile, + ) from transaction_parser.parser_benchmark.doctype.parser_benchmark_expected_field.parser_benchmark_expected_field import ( ParserBenchmarkExpectedField, ) @@ -46,10 +49,10 @@ class ParserBenchmarkDataset(Document): docling: DF.Check enabled: DF.Check expected_fields: DF.Table[ParserBenchmarkExpectedField] - file: DF.Attach - file_type: DF.Data | None + files: DF.Table[ParserBenchmarkDatasetFile] google_gemini_flash_25: DF.Check google_gemini_pro_25: DF.Check + is_multiple_files: DF.Check naming_series: DF.Literal["PAR-BM-DTS-"] ocrmypdf: DF.Check openai_gpt_4o: DF.Check @@ -62,39 +65,27 @@ class ParserBenchmarkDataset(Document): transaction_type: DF.Literal["Sales Order", "Expense"] # end: auto-generated types - 
SUPPORTED_FILE_TYPES = ("PDF", "CSV", "XLSX", "XLS") - def validate(self): - self.set_file_type() - self.validate_file_type() + self.validate_files() self.validate_selected_models() - self.validate_selected_processors() self.validate_expected_fields() - def set_file_type(self): - if self.file_type and not self.has_value_changed("file"): - return + def before_update_after_submit(self): + self.validate_files() - file_doc = frappe.get_last_doc("File", filters={"file_url": self.file}) - self.file_type = file_doc.file_type + def validate_files(self): + """Set file_type for each row and auto-set is_multiple_files.""" + for row in self.files: + if row.file and (not row.file_type or row.has_value_changed("file")): + file_doc = frappe.get_last_doc("File", filters={"file_url": row.file}) + row.file_type = file_doc.file_type - def validate_file_type(self): - if self.file_type not in self.SUPPORTED_FILE_TYPES: - frappe.throw(_("Unsupported file type: {0}").format(self.file_type)) + self.is_multiple_files = len(self.files) > 1 def validate_selected_models(self): if not self.get_selected_models(): frappe.throw(_("Please select at least one AI Model.")) - def validate_selected_processors(self): - if self.file_type != "PDF": - for field in PDF_PROCESSOR_FIELD_MAP: - self.set(field, 0) - return - - if not self.get_selected_processors(): - frappe.throw(_("Please select at least one PDF Processor.")) - def validate_expected_fields(self): if not self.expected_fields: return @@ -129,6 +120,18 @@ def get_selected_processors(self) -> list[str]: label for field, label in PDF_PROCESSOR_FIELD_MAP.items() if self.get(field) ] + def has_pdf_file(self) -> bool: + """Check if any file in the child table is a PDF.""" + return any(row.file_type == "PDF" for row in self.files) + + def get_file_docs(self) -> list: + """Return File documents for each row in the files child table.""" + file_docs = [] + for row in self.files: + file_doc = frappe.get_last_doc("File", filters={"file_url": row.file}) + 
file_docs.append(file_doc) + return file_docs + @frappe.whitelist() def run_benchmark(dataset_name: str): @@ -156,7 +159,7 @@ def create_and_enqueue_benchmark_logs(dataset_name: str) -> list[str]: models = dataset.get_selected_models() processors = ( (dataset.get_selected_processors() or [None]) - if dataset.file_type == "PDF" + if dataset.has_pdf_file() else [None] ) diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/__init__.py b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/__init__.py new file mode 100644 index 0000000..c4fea77 --- /dev/null +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2026, Resilient Tech and contributors +# For license information, please see license.txt diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/parser_benchmark_dataset_file.json b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/parser_benchmark_dataset_file.json new file mode 100644 index 0000000..5f54a45 --- /dev/null +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/parser_benchmark_dataset_file.json @@ -0,0 +1,43 @@ +{ + "actions": [], + "creation": "2026-03-30 00:00:00", + "doctype": "DocType", + "engine": "InnoDB", + "field_order": [ + "file", + "column_break_yahx", + "file_type" + ], + "fields": [ + { + "fieldname": "file", + "fieldtype": "Attach", + "in_list_view": 1, + "label": "File", + "reqd": 1 + }, + { + "fieldname": "file_type", + "fieldtype": "Data", + "in_list_view": 1, + "label": "File Type", + "read_only": 1 + }, + { + "fieldname": "column_break_yahx", + "fieldtype": "Column Break" + } + ], + "istable": 1, + "links": [], + "modified": "2026-03-30 15:01:27.752102", + "modified_by": "Administrator", + "module": "Parser Benchmark", + "name": "Parser Benchmark Dataset File", + "owner": "Administrator", + "permissions": [], + "row_format": 
"Dynamic", + "sort_field": "modified", + "sort_order": "DESC", + "states": [] +} \ No newline at end of file diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/parser_benchmark_dataset_file.py b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/parser_benchmark_dataset_file.py new file mode 100644 index 0000000..78b5e42 --- /dev/null +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_dataset_file/parser_benchmark_dataset_file.py @@ -0,0 +1,23 @@ +# Copyright (c) 2026, Resilient Tech and contributors +# For license information, please see license.txt + +from frappe.model.document import Document + + +class ParserBenchmarkDatasetFile(Document): + # begin: auto-generated types + # This code is auto-generated. Do not modify anything in this block. + + from typing import TYPE_CHECKING + + if TYPE_CHECKING: + from frappe.types import DF + + file: DF.Attach + file_type: DF.Data | None + parent: DF.Data + parentfield: DF.Data + parenttype: DF.Data + # end: auto-generated types + + pass diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.json b/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.json index 9ba294b..0bbe3c6 100644 --- a/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.json +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.json @@ -26,7 +26,6 @@ "column_break_ubhs", "file_parsing_tab", "pdf_processor", - "file_type", "column_break_file_metrics", "page_limit", "section_break_umzr", @@ -107,7 +106,6 @@ "read_only": 1 }, { - "depends_on": "eval: doc.file_type === \"PDF\"", "fieldname": "pdf_processor", "fieldtype": "Select", "label": "PDF Processor", @@ -121,13 +119,7 @@ "label": "Total Time (s)", "read_only": 1 }, - { - "fieldname": "file_type", - "fieldtype": "Data", - "is_virtual": 1, - "label": "File Type", - "read_only": 1 - 
}, + { "fieldname": "file_parsing_tab", "fieldtype": "Tab Break", diff --git a/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.py b/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.py index 0b9e29a..5cd5b7f 100644 --- a/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.py +++ b/transaction_parser/parser_benchmark/doctype/parser_benchmark_log/parser_benchmark_log.py @@ -97,7 +97,3 @@ def party(self): @property def page_limit(self): return self.get_from_dataset("page_limit") or 0 - - @property - def file_type(self): - return self.get_from_dataset("file_type") diff --git a/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.js b/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.js index c22fab8..5c84d22 100644 --- a/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.js +++ b/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.js @@ -12,8 +12,6 @@ const AI_MODELS = [ "Google Gemini Flash-2.5", ]; -const FILE_TYPES = ["PDF", "CSV", "XLSX", "XLS"]; - const PDF_PROCESSORS = ["OCRMyPDF", "Docling"]; const PARTY_TYPE_MAP = { @@ -68,12 +66,6 @@ frappe.query_reports["Transaction Parser Accuracy Analysis"] = { fieldtype: "Dynamic Link", options: "party_type", }, - { - fieldname: "file_type", - label: __("File Type"), - fieldtype: "MultiSelectList", - get_data: (txt) => make_options(FILE_TYPES, txt), - }, { fieldname: "ai_model", label: __("AI Model"), @@ -92,6 +84,12 @@ frappe.query_reports["Transaction Parser Accuracy Analysis"] = { fieldtype: "Check", default: 0, }, + { + fieldname: "is_multiple_files", + label: __("Multiple Files Only"), + fieldtype: "Check", + default: 0, + }, ], }; diff --git 
a/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.py b/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.py index 3d1e7ac..27283ad 100644 --- a/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.py +++ b/transaction_parser/parser_benchmark/report/transaction_parser_accuracy_analysis/transaction_parser_accuracy_analysis.py @@ -34,13 +34,6 @@ "Docling": 1, } -_FILE_TYPE_ORDER = { - "PDF": 0, - "CSV": 1, - "XLSX": 2, - "XLS": 3, -} - class Col(StrEnum): """Column fieldnames — single source of truth for the report.""" @@ -50,7 +43,6 @@ class Col(StrEnum): ACCURACY_SCORE = "accuracy_score" AI_MODEL = "ai_model" PDF_PROCESSOR = "pdf_processor" - FILE_TYPE = "file_type" FILE_PARSE_TIME = "file_parse_time" FILE_PARSE_MEMORY = "file_parse_memory" AI_PARSE_TIME = "ai_parse_time" @@ -155,12 +147,6 @@ def _get_columns(self): "fieldtype": "Data", "width": 110, }, - { - "fieldname": Col.FILE_TYPE, - "label": _("File Type"), - "fieldtype": "Data", - "width": 90, - }, { "fieldname": Col.FILE_PARSE_TIME, "label": _("File Parse (s)"), @@ -254,19 +240,21 @@ def _fetch_logs(self): log.currency, log.dataset, ds.party, - ds.file_type, Coalesce(cust.customer_name, supp.supplier_name, ds.party).as_( "party_name" ), ) .where(log.status == "Completed") .where(ds.docstatus == 1) - .orderby(ds.party, log.ai_model, ds.file_type) + .orderby(ds.party, log.ai_model) ) if not self.filters.get("include_disabled_datasets"): query = query.where(ds.enabled == 1) + if self.filters.get("is_multiple_files"): + query = query.where(ds.is_multiple_files == 1) + # exact-match filters for column, key in ( (ds.company, "company"), @@ -279,7 +267,6 @@ def _fetch_logs(self): # multi-select IN filters for column, key in ( - (ds.file_type, "file_type"), (log.ai_model, "ai_model"), (log.pdf_processor, 
"pdf_processor"), ): @@ -334,7 +321,6 @@ def _build_row(self, r, score_details_map): Col.ACCURACY_SCORE: r.accuracy_score, Col.AI_MODEL: r.ai_model, Col.PDF_PROCESSOR: r.pdf_processor, - Col.FILE_TYPE: r.file_type, Col.DATASET: r.dataset, Col.FILE_PARSE_TIME: r.file_parse_time, Col.FILE_PARSE_MEMORY: r.file_parse_memory, @@ -362,7 +348,6 @@ def _aggregate_by_config(self): row.get(Col.DATASET), row.get(Col.AI_MODEL), row.get(Col.PDF_PROCESSOR) or "", - row.get(Col.FILE_TYPE), ) groups[key].append(row) @@ -374,7 +359,6 @@ def _aggregate_by_config(self): Col.PARTY_NAME: rows[0].get(Col.PARTY_NAME), Col.AI_MODEL: rows[0].get(Col.AI_MODEL), Col.PDF_PROCESSOR: rows[0].get(Col.PDF_PROCESSOR), - Col.FILE_TYPE: rows[0].get(Col.FILE_TYPE), Col.CURRENCY: rows[0].get(Col.CURRENCY), Col.RUN_COUNT: count, } @@ -431,11 +415,10 @@ def _group_by_party(self): @staticmethod def _sort_key(row): - """Sort key for child rows: AI Model → PDF Processor → File Type.""" + """Sort key for child rows: AI Model → PDF Processor.""" return ( _AI_MODEL_ORDER.get(row.get(Col.AI_MODEL), 99), _PDF_PROCESSOR_ORDER.get(row.get(Col.PDF_PROCESSOR), 99), - _FILE_TYPE_ORDER.get(row.get(Col.FILE_TYPE), 99), ) def _group_row(self, party, rows): diff --git a/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.js b/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.js index 6061239..2c252e2 100644 --- a/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.js +++ b/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.js @@ -12,8 +12,6 @@ const AI_MODELS = [ "Google Gemini Flash-2.5", ]; -const FILE_TYPES = ["PDF", "CSV", "XLSX", "XLS"]; - const PDF_PROCESSORS = ["OCRMyPDF", "Docling"]; const PARTY_TYPE_MAP = { @@ -68,12 +66,6 @@ 
frappe.query_reports["Transaction Parser Version Comparison"] = { fieldtype: "Dynamic Link", options: "party_type", }, - { - fieldname: "file_type", - label: __("File Type"), - fieldtype: "MultiSelectList", - get_data: (txt) => make_options(FILE_TYPES, txt), - }, { fieldname: "ai_model", label: __("AI Model"), @@ -92,6 +84,12 @@ frappe.query_reports["Transaction Parser Version Comparison"] = { fieldtype: "Check", default: 0, }, + { + fieldname: "is_multiple_files", + label: __("Multiple Files Only"), + fieldtype: "Check", + default: 0, + }, ], }; diff --git a/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.py b/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.py index 6773fb8..e20aae9 100644 --- a/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.py +++ b/transaction_parser/parser_benchmark/report/transaction_parser_version_comparison/transaction_parser_version_comparison.py @@ -30,13 +30,6 @@ "Docling": 1, } -_FILE_TYPE_ORDER = { - "PDF": 0, - "CSV": 1, - "XLSX": 2, - "XLS": 3, -} - class Col(StrEnum): """Column fieldnames — single source of truth for the report.""" @@ -46,7 +39,6 @@ class Col(StrEnum): DATASET = "dataset" AI_MODEL = "ai_model" PDF_PROCESSOR = "pdf_processor" - FILE_TYPE = "file_type" COMMIT_HASH = "commit_hash" COMMIT_MESSAGE = "commit_message" ACCURACY_SCORE = "accuracy_score" @@ -116,12 +108,6 @@ def _get_columns(self): "fieldtype": "Data", "width": 110, }, - { - "fieldname": Col.FILE_TYPE, - "label": _("File Type"), - "fieldtype": "Data", - "width": 90, - }, { "fieldname": Col.COMMIT_HASH, "label": _("Commit"), @@ -179,7 +165,6 @@ def _fetch_logs(self): log.commit_hash, log.commit_message, ds.party, - ds.file_type, Coalesce(cust.customer_name, supp.supplier_name, ds.party).as_( "party_name" ), @@ -187,12 +172,15 @@ def _fetch_logs(self): 
.where(log.status == "Completed") .where(ds.docstatus == 1) .where(Coalesce(log.commit_hash, "") != "") - .orderby(ds.party, log.ai_model, ds.file_type) + .orderby(ds.party, log.ai_model) ) if not self.filters.get("include_disabled_datasets"): query = query.where(ds.enabled == 1) + if self.filters.get("is_multiple_files"): + query = query.where(ds.is_multiple_files == 1) + # exact-match filters for column, key in ( (ds.company, "company"), @@ -205,7 +193,6 @@ def _fetch_logs(self): # multi-select IN filters for column, key in ( - (ds.file_type, "file_type"), (log.ai_model, "ai_model"), (log.pdf_processor, "pdf_processor"), ): @@ -264,7 +251,6 @@ def _build_row(self, r, score_details_map): Col.DATASET: r.dataset, Col.AI_MODEL: r.ai_model, Col.PDF_PROCESSOR: r.pdf_processor, - Col.FILE_TYPE: r.file_type, Col.COMMIT_HASH: short_hash, Col.COMMIT_MESSAGE: commit_msg, Col.ACCURACY_SCORE: r.accuracy_score, @@ -286,7 +272,6 @@ def _aggregate_by_config(self): row.get(Col.DATASET), row.get(Col.AI_MODEL), row.get(Col.PDF_PROCESSOR) or "", - row.get(Col.FILE_TYPE), row.get(Col.COMMIT_HASH), ) groups[key].append(row) @@ -300,7 +285,6 @@ def _aggregate_by_config(self): Col.DATASET: rows[0].get(Col.DATASET), Col.AI_MODEL: rows[0].get(Col.AI_MODEL), Col.PDF_PROCESSOR: rows[0].get(Col.PDF_PROCESSOR), - Col.FILE_TYPE: rows[0].get(Col.FILE_TYPE), Col.COMMIT_HASH: rows[0].get(Col.COMMIT_HASH), Col.COMMIT_MESSAGE: rows[0].get(Col.COMMIT_MESSAGE), Col.RUN_COUNT: count, @@ -353,11 +337,10 @@ def _group_by_party(self): @staticmethod def _sort_key(row): - """Sort: AI Model → PDF Processor → File Type → Commit Hash.""" + """Sort: AI Model → PDF Processor → Commit Hash.""" return ( _AI_MODEL_ORDER.get(row.get(Col.AI_MODEL), 99), _PDF_PROCESSOR_ORDER.get(row.get(Col.PDF_PROCESSOR), 99), - _FILE_TYPE_ORDER.get(row.get(Col.FILE_TYPE), 99), row.get(Col.COMMIT_HASH) or "", ) diff --git a/transaction_parser/parser_benchmark/runner.py b/transaction_parser/parser_benchmark/runner.py index 
716fffe..8f949d7 100644 --- a/transaction_parser/parser_benchmark/runner.py +++ b/transaction_parser/parser_benchmark/runner.py @@ -2,6 +2,7 @@ from timeit import default_timer import frappe +from frappe import _ from frappe.core.doctype.file.file import File from frappe.utils import cint, flt @@ -57,11 +58,11 @@ def run(self): total_start = default_timer() try: - file_doc: File = self._get_file_doc() - self.controller: Transaction = self._get_controller(file_doc) + file_docs: list[File] = self._get_file_docs() + self.controller: Transaction = self._get_controller(file_docs[0]) - file_content = self._run_file_parsing(file_doc) - ai_content = self._run_ai_parsing(file_content, file_doc.name) + file_content = self._run_file_parsing(file_docs) + ai_content = self._run_ai_parsing(file_content, file_docs[0].name) self._calculate_cost() self._score_response(ai_content) @@ -80,8 +81,11 @@ def run(self): # ── helpers ────────────────────────────────────────────── - def _get_file_doc(self): - return frappe.get_last_doc("File", filters={"file_url": self.dataset.file}) + def _get_file_docs(self) -> list[File]: + file_docs = self.dataset.get_file_docs() + if not file_docs: + frappe.throw(_("No files in dataset {0}").format(self.dataset.name)) + return file_docs def _get_controller(self, file_doc: File) -> Transaction: ds = self.dataset @@ -103,11 +107,7 @@ def _get_cost_row(self): # ── step 1: file parsing ──────────────────────────────── - def _run_file_parsing(self, file_doc: File) -> str: - pdf_processor = None - if self.log.file_type == "PDF" and self.log.pdf_processor: - pdf_processor = get_pdf_processor(self.log.pdf_processor) - + def _run_file_parsing(self, file_docs: list[File]) -> str: # to prevent stopping an already running tracemalloc instance was_tracing = tracemalloc.is_tracing() if not was_tracing: @@ -115,10 +115,23 @@ def _run_file_parsing(self, file_doc: File) -> str: start = default_timer() try: - content = FileProcessor().get_content( - file_doc, - 
self.dataset.page_limit or None, - pdf_processor, + contents = [] + for file_doc in file_docs: + pdf_processor = None + if file_doc.file_type == "PDF" and self.log.pdf_processor: + pdf_processor = get_pdf_processor(self.log.pdf_processor) + + content = FileProcessor().get_content( + file_doc, + self.dataset.page_limit or None, + pdf_processor, + ) + contents.append(content) + + combined = ( + "\n\n--- Document Separator ---\n\n".join(contents) + if len(contents) > 1 + else contents[0] ) finally: self.log.file_parse_time = flt(default_timer() - start, self.precision) @@ -129,8 +142,8 @@ def _run_file_parsing(self, file_doc: File) -> str: peak / 1024 / 1024, self.precision ) # bytes → MB - self.log.file_content = content - return content + self.log.file_content = combined + return combined # ── step 2: AI parsing ────────────────────────────────── diff --git a/transaction_parser/patches.txt b/transaction_parser/patches.txt index d88f879..9107383 100644 --- a/transaction_parser/patches.txt +++ b/transaction_parser/patches.txt @@ -2,9 +2,11 @@ # Patches added in this section will be executed before doctypes are migrated # Read docs to understand patches: https://frappeframework.com/docs/v14/user/en/database-migrations transaction_parser.patches.rename_gemini_models +transaction_parser.patches.remove_dataset_file_field [post_model_sync] # Patches added in this section will be executed after doctypes are migrated execute:from transaction_parser.install import after_install; after_install() #2 transaction_parser.patches.set_default_pdf_processor #1 transaction_parser.patches.recalculate_accuracy +transaction_parser.patches.populate_dataset_files_table diff --git a/transaction_parser/patches/populate_dataset_files_table.py b/transaction_parser/patches/populate_dataset_files_table.py new file mode 100644 index 0000000..07a415f --- /dev/null +++ b/transaction_parser/patches/populate_dataset_files_table.py @@ -0,0 +1,52 @@ +""" +Populate the new ``files`` child table on 
Parser Benchmark Dataset.
+
+After model_sync creates the ``Parser Benchmark Dataset File`` child table,
+insert one child row per distinct file URL among the File documents attached to
+each dataset (linked by the ``remove_dataset_file_field`` pre_model_sync patch).
+"""
+
+import frappe
+
+
+def execute():
+    """Backfill ``files`` rows; datasets that already have rows are skipped."""
+    for name in frappe.get_all("Parser Benchmark Dataset", pluck="name"):
+        if frappe.db.count("Parser Benchmark Dataset File", {"parent": name}):
+            continue
+        # NOTE(review): every attachment is copied, not only the legacy `file` URL.
+        attached = frappe.get_all(
+            "File",
+            filters={
+                "attached_to_doctype": "Parser Benchmark Dataset",
+                "attached_to_name": name,
+            },
+            fields=["file_url", "file_type"],
+            order_by="creation asc",  # stable, oldest-first row order
+        )
+        # One row per URL: File docs sharing a URL point at the same file.
+        files = list({f.file_url: f for f in attached if f.file_url}.values())
+        if not files:
+            continue
+
+        for idx, f in enumerate(files, 1):
+            child = frappe.new_doc("Parser Benchmark Dataset File")
+            child.update(
+                {
+                    "parent": name,
+                    "parenttype": "Parser Benchmark Dataset",
+                    "parentfield": "files",
+                    "idx": idx,
+                    "file": f.file_url,
+                    "file_type": f.file_type or "",
+                }
+            )
+            child.db_insert()
+
+        frappe.db.set_value(
+            "Parser Benchmark Dataset",
+            name,
+            "is_multiple_files",
+            int(len(files) > 1),
+            update_modified=False,  # a data patch must not bump `modified`
+        )
diff --git a/transaction_parser/patches/remove_dataset_file_field.py b/transaction_parser/patches/remove_dataset_file_field.py
new file mode 100644
index 0000000..87e2b48
--- /dev/null
+++ b/transaction_parser/patches/remove_dataset_file_field.py
@@ -0,0 +1,73 @@
+"""
+Migrate `file` field data on Parser Benchmark Dataset to Frappe File attachments.
+
+Before the `file` column is dropped (pre_model_sync), ensure every Dataset that
+had a file URL stored in the `file` field has a corresponding File doc properly
+linked via `attached_to_doctype` / `attached_to_name`.
+"""
+
+import frappe
+
+
+def execute():
+    """Ensure each dataset's legacy ``file`` URL has a File doc attached to it.
+
+    For every dataset with a ``file`` value, in order of preference: do nothing
+    if a File doc for that URL is already attached to the dataset, adopt an
+    unattached File doc for that URL, otherwise create a new attached File doc.
+    """
+    # Nothing to migrate when the legacy column does not exist.
+    if not frappe.db.has_column("Parser Benchmark Dataset", "file"):
+        return
+
+    datasets = frappe.get_all(
+        "Parser Benchmark Dataset",
+        filters={"file": ("is", "set")},
+        fields=["name", "file"],
+    )
+
+    for ds in datasets:
+        if not ds.file:
+            continue
+
+        # Fetch every File doc for the URL at once: picking a single arbitrary
+        # row could hide an adoptable orphan behind a doc attached elsewhere.
+        candidates = frappe.get_all(
+            "File",
+            filters={"file_url": ds.file},
+            fields=["name", "attached_to_doctype", "attached_to_name"],
+        )
+
+        if any(
+            f.attached_to_doctype == "Parser Benchmark Dataset"
+            and f.attached_to_name == ds.name
+            for f in candidates
+        ):
+            continue  # already linked to this dataset
+
+        orphan = next((f for f in candidates if not f.attached_to_doctype), None)
+        if orphan:
+            # Adopt the unattached File doc rather than duplicating it.
+            frappe.db.set_value(
+                "File",
+                orphan.name,
+                {
+                    "attached_to_doctype": "Parser Benchmark Dataset",
+                    "attached_to_name": ds.name,
+                },
+            )
+        else:
+            # No File doc for the URL, or all are attached elsewhere.
+            _create_attachment(ds.name, ds.file)
+
+
+def _create_attachment(dataset_name: str, file_url: str):
+    """Create a new File doc attached to the given dataset.
+
+    Any validation error raised by ``insert`` propagates and aborts the patch.
+    """
+    f = frappe.new_doc("File")
+    f.file_url = file_url
+    f.attached_to_doctype = "Parser Benchmark Dataset"
+    f.attached_to_name = dataset_name
+    f.insert(ignore_permissions=True)