Skip to content

Commit

Permalink
feat: process title and footnote
Browse files: browse the repository at this point in the history
  • Loading branch information
xu rui committed Mar 22, 2024
1 parent e3e125b commit 432e1ae
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
3 changes: 2 additions & 1 deletion magic_pdf/pdf_parse_for_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ def parse_pdf_for_train(
# isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json)
接下来开始进行预处理过程
"""

title_bboxs = parse_titles(page_id, page, model_output_json)

"""去掉每页的页码、页眉、页脚"""
page_no_bboxs = parse_pageNos(page_id, page, model_output_json)
header_bboxs = parse_headers(page_id, page, model_output_json)
Expand Down
16 changes: 14 additions & 2 deletions magic_pdf/train_utils/convert_to_train_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,16 @@ def convert_to_train_format(jso: dict) -> []:

# 脚注, 目前没有看到例子
for para in v["para_blocks"]:
n_bbox = {"category_id": 2, "bbox": para["bbox"]}
bboxes.append(n_bbox)
if "paras" in para:
paras = para["paras"]
for para_key, para_content in paras.items():
para_bbox = para_content["para_bbox"]
is_para_title = para_content["is_para_title"]
if is_para_title:
n_bbox = {"category_id": 0, "bbox": para_bbox}
else:
n_bbox = {"category_id": 2, "bbox": para_bbox}
bboxes.append(n_bbox)

for inline_equation in v["inline_equations"]:
n_bbox = {"category_id": 13, "bbox": inline_equation["bbox"]}
Expand All @@ -46,6 +54,10 @@ def convert_to_train_format(jso: dict) -> []:
n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
bboxes.append(n_bbox)

for footnote in v['footnote_bboxes_tmp']:
n_bbox = {"category_id": 5, "bbox": footnote["bbox"]}
bboxes.append(n_bbox)

info["bboxes"] = bboxes
info["layout_tree"] = v["layout_bboxes"]
pages.append(info)
Expand Down

0 comments on commit 432e1ae

Please sign in to comment.