Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
"field_order": [
"enabled",
"column_break_title",
"section_break_file",
"file",
"file_type",
"column_break_gbap",
"files_section",
"files",
"is_multiple_files",
"section_break_txfs",
"transaction_type",
"column_break_xusg",
"country",
"section_break_sobg",
"company",
Expand Down Expand Up @@ -59,20 +60,12 @@
"label": "Enabled"
},
{
"fieldname": "section_break_file",
"fieldtype": "Section Break"
},
{
"fieldname": "file",
"fieldtype": "Attach",
"label": "File",
"print_hide": 1,
"reqd": 1
},
{
"fieldname": "file_type",
"fieldtype": "Data",
"label": "File Type",
"allow_on_submit": 1,
"default": "0",
"fieldname": "is_multiple_files",
"fieldtype": "Check",
"in_standard_filter": 1,
"label": "Multiple Files",
"read_only": 1
},
{
Expand All @@ -91,10 +84,6 @@
"options": "India\nOther",
"reqd": 1
},
{
"fieldname": "column_break_gbap",
"fieldtype": "Column Break"
},
{
"fieldname": "company",
"fieldtype": "Link",
Expand All @@ -115,6 +104,19 @@
"label": "Party",
"options": "party_type"
},
{
"fieldname": "files_section",
"fieldtype": "Section Break",
"label": "Files"
},
{
"allow_on_submit": 1,
"fieldname": "files",
"fieldtype": "Table",
"label": "Files",
"options": "Parser Benchmark Dataset File",
"reqd": 1
},
{
"fieldname": "processing_section",
"fieldtype": "Section Break",
Expand Down Expand Up @@ -181,7 +183,6 @@
"label": "Google Gemini Flash-2.5"
},
{
"depends_on": "eval: doc.file_type === \"PDF\"",
"fieldname": "pdf_processor_section",
"fieldtype": "Section Break",
"label": "PDF Processors"
Expand Down Expand Up @@ -273,6 +274,15 @@
{
"fieldname": "column_break_aoce",
"fieldtype": "Column Break"
},
{
"fieldname": "section_break_txfs",
"fieldtype": "Section Break",
"label": "Transaction Details"
},
{
"fieldname": "column_break_xusg",
"fieldtype": "Column Break"
}
],
"index_web_pages_for_search": 1,
Expand All @@ -283,7 +293,7 @@
"link_fieldname": "dataset"
}
],
"modified": "2026-03-27 10:32:46.620190",
"modified": "2026-03-30 15:01:03.977940",
"modified_by": "Administrator",
"module": "Parser Benchmark",
"name": "Parser Benchmark Dataset",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ class ParserBenchmarkDataset(Document):
if TYPE_CHECKING:
from frappe.types import DF

from transaction_parser.parser_benchmark.doctype.parser_benchmark_dataset_file.parser_benchmark_dataset_file import (
ParserBenchmarkDatasetFile,
)
from transaction_parser.parser_benchmark.doctype.parser_benchmark_expected_field.parser_benchmark_expected_field import (
ParserBenchmarkExpectedField,
)
Expand All @@ -46,10 +49,10 @@ class ParserBenchmarkDataset(Document):
docling: DF.Check
enabled: DF.Check
expected_fields: DF.Table[ParserBenchmarkExpectedField]
file: DF.Attach
file_type: DF.Data | None
files: DF.Table[ParserBenchmarkDatasetFile]
google_gemini_flash_25: DF.Check
google_gemini_pro_25: DF.Check
is_multiple_files: DF.Check
naming_series: DF.Literal["PAR-BM-DTS-"]
ocrmypdf: DF.Check
openai_gpt_4o: DF.Check
Expand All @@ -62,39 +65,27 @@ class ParserBenchmarkDataset(Document):
transaction_type: DF.Literal["Sales Order", "Expense"]
# end: auto-generated types

SUPPORTED_FILE_TYPES = ("PDF", "CSV", "XLSX", "XLS")

def validate(self):
self.set_file_type()
self.validate_file_type()
self.validate_files()
self.validate_selected_models()
self.validate_selected_processors()
self.validate_expected_fields()

def set_file_type(self):
if self.file_type and not self.has_value_changed("file"):
return
def before_update_after_submit(self):
self.validate_files()

file_doc = frappe.get_last_doc("File", filters={"file_url": self.file})
self.file_type = file_doc.file_type
def validate_files(self):
"""Set file_type for each row and auto-set is_multiple_files."""
for row in self.files:
if row.file and (not row.file_type or row.has_value_changed("file")):
file_doc = frappe.get_last_doc("File", filters={"file_url": row.file})
row.file_type = file_doc.file_type

def validate_file_type(self):
if self.file_type not in self.SUPPORTED_FILE_TYPES:
frappe.throw(_("Unsupported file type: {0}").format(self.file_type))
self.is_multiple_files = len(self.files) > 1

def validate_selected_models(self):
    """Require that at least one AI model is selected on the document.

    Stops validation through ``frappe.throw`` when
    ``get_selected_models()`` returns an empty (falsy) result.
    """
    chosen_models = self.get_selected_models()
    if chosen_models:
        return

    frappe.throw(_("Please select at least one AI Model."))

def validate_selected_processors(self):
if self.file_type != "PDF":
for field in PDF_PROCESSOR_FIELD_MAP:
self.set(field, 0)
return

if not self.get_selected_processors():
frappe.throw(_("Please select at least one PDF Processor."))

def validate_expected_fields(self):
if not self.expected_fields:
return
Expand Down Expand Up @@ -129,6 +120,18 @@ def get_selected_processors(self) -> list[str]:
label for field, label in PDF_PROCESSOR_FIELD_MAP.items() if self.get(field)
]

def has_pdf_file(self) -> bool:
    """Tell whether at least one row of the ``files`` child table is a PDF.

    A row counts only when its ``file_type`` equals the exact string
    ``"PDF"``; the comparison is case-sensitive.
    """
    for attachment in self.files:
        if attachment.file_type == "PDF":
            return True
    return False

def get_file_docs(self) -> list:
    """Collect one ``File`` document per row of the ``files`` child table.

    Each row is resolved with ``frappe.get_last_doc`` filtered on the row's
    ``file`` value as ``file_url``. Results keep the table's row order.
    """
    return [
        frappe.get_last_doc("File", filters={"file_url": attachment.file})
        for attachment in self.files
    ]


@frappe.whitelist()
def run_benchmark(dataset_name: str):
Expand Down Expand Up @@ -156,7 +159,7 @@ def create_and_enqueue_benchmark_logs(dataset_name: str) -> list[str]:
models = dataset.get_selected_models()
processors = (
(dataset.get_selected_processors() or [None])
if dataset.file_type == "PDF"
if dataset.has_pdf_file()
else [None]
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2026, Resilient Tech and contributors
# For license information, please see license.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"actions": [],
"creation": "2026-03-30 00:00:00",
"doctype": "DocType",
"engine": "InnoDB",
"field_order": [
"file",
"column_break_yahx",
"file_type"
],
"fields": [
{
"fieldname": "file",
"fieldtype": "Attach",
"in_list_view": 1,
"label": "File",
"reqd": 1
},
{
"fieldname": "file_type",
"fieldtype": "Data",
"in_list_view": 1,
"label": "File Type",
"read_only": 1
},
{
"fieldname": "column_break_yahx",
"fieldtype": "Column Break"
}
],
"istable": 1,
"links": [],
"modified": "2026-03-30 15:01:27.752102",
"modified_by": "Administrator",
"module": "Parser Benchmark",
"name": "Parser Benchmark Dataset File",
"owner": "Administrator",
"permissions": [],
"row_format": "Dynamic",
"sort_field": "modified",
"sort_order": "DESC",
"states": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) 2026, Resilient Tech and contributors
# For license information, please see license.txt

from frappe.model.document import Document


class ParserBenchmarkDatasetFile(Document):
    """Document class for the "Parser Benchmark Dataset File" DocType.

    Each record holds one attached ``file`` and an optional ``file_type``.
    The class adds no behaviour of its own; it only carries the generated
    field type hints below.
    """

    # begin: auto-generated types
    # This code is auto-generated. Do not modify anything in this block.

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from frappe.types import DF

        file: DF.Attach
        file_type: DF.Data | None
        parent: DF.Data
        parentfield: DF.Data
        parenttype: DF.Data
    # end: auto-generated types

    pass
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"column_break_ubhs",
"file_parsing_tab",
"pdf_processor",
"file_type",
"column_break_file_metrics",
"page_limit",
"section_break_umzr",
Expand Down Expand Up @@ -107,7 +106,6 @@
"read_only": 1
},
{
"depends_on": "eval: doc.file_type === \"PDF\"",
"fieldname": "pdf_processor",
"fieldtype": "Select",
"label": "PDF Processor",
Expand All @@ -121,13 +119,7 @@
"label": "Total Time (s)",
"read_only": 1
},
{
"fieldname": "file_type",
"fieldtype": "Data",
"is_virtual": 1,
"label": "File Type",
"read_only": 1
},

{
"fieldname": "file_parsing_tab",
"fieldtype": "Tab Break",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,3 @@ def party(self):
@property
def page_limit(self):
    """Page limit read from the linked dataset, or ``0`` when it is unset.

    Any falsy value returned by ``get_from_dataset("page_limit")`` is
    reported as ``0``.
    """
    dataset_limit = self.get_from_dataset("page_limit")
    if dataset_limit:
        return dataset_limit
    return 0

@property
def file_type(self):
return self.get_from_dataset("file_type")
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ const AI_MODELS = [
"Google Gemini Flash-2.5",
];

const FILE_TYPES = ["PDF", "CSV", "XLSX", "XLS"];

const PDF_PROCESSORS = ["OCRMyPDF", "Docling"];

const PARTY_TYPE_MAP = {
Expand Down Expand Up @@ -68,12 +66,6 @@ frappe.query_reports["Transaction Parser Accuracy Analysis"] = {
fieldtype: "Dynamic Link",
options: "party_type",
},
{
fieldname: "file_type",
label: __("File Type"),
fieldtype: "MultiSelectList",
get_data: (txt) => make_options(FILE_TYPES, txt),
},
{
fieldname: "ai_model",
label: __("AI Model"),
Expand All @@ -92,6 +84,12 @@ frappe.query_reports["Transaction Parser Accuracy Analysis"] = {
fieldtype: "Check",
default: 0,
},
{
fieldname: "is_multiple_files",
label: __("Multiple Files Only"),
fieldtype: "Check",
default: 0,
},
],
};

Expand Down
Loading
Loading