Skip to content

Commit

Permalink
fix: TableFormer raises IndexError: too many indices for array
Browse files Browse the repository at this point in the history
Signed-off-by: Maxim Lysak <[email protected]>
Co-authored-by: Maxim Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maxim Lysak authored Sep 3, 2024
1 parent b478eae commit ad494ca
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 23 deletions.
40 changes: 23 additions & 17 deletions docling_ibm_models/tableformer/data_management/tf_cell_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,14 @@ def match_cells(self, iocr_page, table_bbox, prediction):
Dictionary with all details about the matches between the table and pdf cells
"""
pdf_cells = copy.deepcopy(iocr_page["tokens"])
for word in pdf_cells:
word["bbox"] = [
word["bbox"]["l"],
word["bbox"]["t"],
word["bbox"]["r"],
word["bbox"]["b"],
]
if len(pdf_cells) > 0:
for word in pdf_cells:
word["bbox"] = [
word["bbox"]["l"],
word["bbox"]["t"],
word["bbox"]["r"],
word["bbox"]["b"],
]
table_bboxes = prediction["bboxes"]
table_classes = prediction["classes"]
# BBOXES transformed...
Expand All @@ -145,9 +146,13 @@ def match_cells(self, iocr_page, table_bbox, prediction):
table_cells = self._build_table_cells(
html_seq, otsl_seq, table_bboxes_page, table_classes
)
matches, matches_counter = self._intersection_over_pdf_match(
table_cells, pdf_cells
)

matches = {}
matches_counter = 0
if len(pdf_cells) > 0:
matches, matches_counter = self._intersection_over_pdf_match(
table_cells, pdf_cells
)

self._log().debug("matches_counter: {}".format(matches_counter))

Expand Down Expand Up @@ -188,13 +193,14 @@ def match_cells_dummy(self, iocr_page, table_bbox, prediction):
Dictionary with all details about the matches between the table and pdf cells
"""
pdf_cells = copy.deepcopy(iocr_page["tokens"])
for word in pdf_cells:
word["bbox"] = [
word["bbox"]["l"],
word["bbox"]["t"],
word["bbox"]["r"],
word["bbox"]["b"],
]
if len(pdf_cells) > 0:
for word in pdf_cells:
word["bbox"] = [
word["bbox"]["l"],
word["bbox"]["t"],
word["bbox"]["r"],
word["bbox"]["b"],
]

table_bboxes = prediction["bboxes"]
table_classes = prediction["classes"]
Expand Down
25 changes: 19 additions & 6 deletions docling_ibm_models/tableformer/data_management/tf_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,12 @@ def predict_dummy(
prediction["bboxes"] = corrected_bboxes

# Match the cells
matching_details = {"table_cells": [], "matches": {}}
matching_details = {
"table_cells": [],
"matches": {},
"pdf_cells": [],
"prediction_bboxes_page": [],
}

# Table bbox upscaling will scale predicted bboxes too within cell matcher
scaled_table_bbox = [
Expand Down Expand Up @@ -803,7 +808,12 @@ def predict(
prediction["bboxes"] = corrected_bboxes

# Match the cells
matching_details = {"table_cells": [], "matches": {}}
matching_details = {
"table_cells": [],
"matches": {},
"pdf_cells": [],
"prediction_bboxes_page": [],
}

# Table bbox upscaling will scale predicted bboxes too within cell matcher
scaled_table_bbox = [
Expand All @@ -819,10 +829,13 @@ def predict(
)
# Post-processing
if len(prediction["bboxes"]) > 0:
if self.enable_post_process:
AggProfiler().begin("post_process", self._prof)
matching_details = self._post_processor.process(matching_details)
AggProfiler().end("post_process", self._prof)
if (
len(iocr_page["tokens"]) > 0
): # There are at least some pdf cells to match with
if self.enable_post_process:
AggProfiler().begin("post_process", self._prof)
matching_details = self._post_processor.process(matching_details)
AggProfiler().end("post_process", self._prof)

# Generate the expected Docling responses
AggProfiler().begin("generate_docling_response", self._prof)
Expand Down
Binary file added tests/test_data/samples/empty_iocr.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
67 changes: 67 additions & 0 deletions tests/test_data/samples/empty_iocr.png.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{
"doc_source_type": {},
"font_dist_info": {},
"info": {
"histogram": {
"mean-char-height": {},
"mean-char-width": {},
"number-of-chars": {}
},
"styles": []
},
"title": "",
"metadata": {
"numPages": 1
},
"pages": [
{
"blocks": [],
"cells": [],
"height": 1612,
"width": 1237,
"dimensions": {
"bbox": [
0.0,
0.0,
1237,
1612
],
"height": 1612,
"origin": "TopLeft",
"width": 1237
},
"fonts": [],
"links": [],
"rotation": 0.0,
"rectangles": [],
"textPositions": [],
"text_lines": [],
"tokens": [],
"localized_image_locations": [],
"scanned_elements": [],
"paths": [],
"pageNumber": 1,
"page_image": {},
"lang": [
"en",
"pt",
"fr",
"it",
"es",
"fi"
]
}
],
"settings": {},
"passedHeadersFooters": {
"headerFooters": {
"1": {
"headerHeight": 0,
"footerHeight": 0
}
},
"headerFound": false,
"footerFound": false
},
"styles": []
}
3 changes: 3 additions & 0 deletions tests/test_tf_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,17 @@
"table_jsons": [
"./tests/test_data/samples/ADS.2007.page_123.png_iocr.parse_format.json",
"./tests/test_data/samples/PHM.2013.page_30.png_iocr.parse_format.json",
"./tests/test_data/samples/empty_iocr.png.json"
],
"png_images": [
"./tests/test_data/samples/ADS.2007.page_123.png",
"./tests/test_data/samples/PHM.2013.page_30.png",
"./tests/test_data/samples/empty_iocr.png"
],
"table_bboxes": [
[[178, 748, 1061, 976], [177, 1163, 1062, 1329]],
[[100, 186, 1135, 525]],
[[178, 748, 1061, 976], [177, 1163, 1062, 1329]]
],
}

Expand Down

0 comments on commit ad494ca

Please sign in to comment.