Skip to content

Commit

Permalink
Merge pull request #6 from myhloli/feat/add_extract_train_data
Browse files Browse the repository at this point in the history
feat: add extract_train_data
  • Loading branch information
icecraft authored Mar 21, 2024
2 parents 056aed8 + 390fdb2 commit 439c18f
Show file tree
Hide file tree
Showing 7 changed files with 1,591 additions and 134 deletions.
684 changes: 684 additions & 0 deletions magic_pdf/pdf_parse_for_train.py

Large diffs are not rendered by default.

444 changes: 310 additions & 134 deletions magic_pdf/pipeline.py

Large diffs are not rendered by default.

Empty file.
52 changes: 52 additions & 0 deletions magic_pdf/train_utils/convert_to_train_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@


def convert_to_train_format(jso: dict) -> list:
    """Convert per-page parse results into a flat list of training annotations.

    Args:
        jso: Mapping whose values are per-page dicts. The mapping keys are
            not used. Each page dict must provide ``page_idx``,
            ``page_size`` (unpacked as ``width, height``),
            ``image_bboxes_with_caption``, ``table_bboxes_with_caption``,
            ``bak_page_no_bboxes``, ``bak_header_bboxes``,
            ``bak_footer_bboxes``, ``para_blocks``, ``inline_equations`` and
            ``interline_equations``.

    Returns:
        One dict per page, in the iteration order of ``jso``, shaped as
        ``{"page_info": {"page_no", "height", "width"}, "bboxes": [...]}``.
        Every bbox entry has ``category_id`` and ``bbox``; image and table
        entries also carry ``caption_bbox`` when the source entry has a
        ``caption`` key.

    Raises:
        KeyError: If a page dict lacks one of the required keys.
    """
    # (page key, category id) pairs, listed in the order their boxes are emitted.
    # Entries under these keys are dicts with "bbox" and an optional "caption".
    captioned_sources = (
        ("image_bboxes_with_caption", 1),
        ("table_bboxes_with_caption", 7),
    )
    # Entries under these keys are the bounding boxes themselves.
    plain_bbox_sources = (
        ("bak_page_no_bboxes", 4),
        ("bak_header_bboxes", 3),
        ("bak_footer_bboxes", 6),
    )
    # Entries under these keys are dicts whose "bbox" is copied as-is.
    # Footnotes: no example has been seen so far, so none are emitted.
    keyed_bbox_sources = (
        ("para_blocks", 2),
        ("inline_equations", 13),
        ("interline_equations", 10),
    )

    pages = []
    for page in jso.values():
        page_idx = page["page_idx"]
        width, height = page["page_size"]
        info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}

        bboxes = []
        for key, category_id in captioned_sources:
            for item in page[key]:
                entry = {"category_id": category_id, "bbox": item["bbox"]}
                if "caption" in item:
                    entry["caption_bbox"] = item["caption"]
                bboxes.append(entry)

        for key, category_id in plain_bbox_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": bbox} for bbox in page[key]
            )

        for key, category_id in keyed_bbox_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": item["bbox"]}
                for item in page[key]
            )

        info["bboxes"] = bboxes
        pages.append(info)

    return pages
59 changes: 59 additions & 0 deletions magic_pdf/train_utils/extract_caption.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from magic_pdf.libs.boxbase import _is_in


def extract_caption_bbox(outer: list, inner: list) -> list:
    """Pair inner boxes with enclosing outer boxes and derive caption areas.

    For each box in ``inner``, the first not-yet-used box in ``outer`` that
    contains it (as decided by ``_is_in``) and is not the same box within a
    0.01 tolerance on all four coordinates is treated as the combined
    "box + caption" region. The caption is the largest of the four strips
    left between the inner box and the edges of that outer box.

    Args:
        outer: Candidate enclosing boxes. Only the first four items of each
            box are read, as ``x0, y0, x1, y1``.
        inner: Boxes to find captions for. Only the first four items of each
            box are read, as ``x0, y0, x1, y1``.

    Returns:
        One dict per inner box, in input order::

            {
                "bbox": [1, 2, 3, 4],
                "caption": [5, 6, 7, 8],  # present only when a match exists
            }

    Raises:
        ValueError: If an inner box has fewer than four items.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    logger = logging.getLogger(__name__)
    logger.debug("extract_caption_bbox outer=%s inner=%s", outer, inner)

    def is_float_equal(a, b):
        # Deliberately non-strict float comparison.
        return abs(a - b) < 0.01

    def area(rect):
        return abs(rect[0] - rect[2]) * abs(rect[1] - rect[3])

    found_count = 0  # reported in the debug log only
    unused_outer = dict(enumerate(outer))
    ret = []
    for v in inner:
        # Boxes may carry trailing fields; use only the four coordinates.
        inner_box = v[:4]
        ix0, iy0, ix1, iy1 = inner_box
        d = {"bbox": inner_box}

        found_idx = None
        for idx, candidate in unused_outer.items():
            outer_box = candidate[:4]
            same_box = all(
                is_float_equal(i, o) for i, o in zip(inner_box, outer_box)
            )
            # An outer box equal to the inner box leaves no room for a caption.
            if _is_in(inner_box, outer_box) and not same_box:
                found_idx = idx
                break

        if found_idx is not None:
            found_count += 1
            # An outer box may locate the caption of only one inner box.
            ox0, oy0, ox1, oy1 = unused_outer.pop(found_idx)[:4]
            strips = [
                [ox0, oy0, ix0, oy1],  # between the outer and inner x0 edges
                [ox0, oy0, ox1, iy0],  # between the outer and inner y0 edges
                [ox0, iy1, ox1, oy1],  # between the inner and outer y1 edges
                [ix1, oy0, ox1, oy1],  # between the inner and outer x1 edges
            ]
            # The strip with the largest area is the caption. A stable sort
            # followed by [-1] keeps the last strip on ties.
            d["caption"] = sorted(strips, key=area)[-1]

        ret.append(d)

    logger.debug("found_count: %s", found_count)
    return ret
159 changes: 159 additions & 0 deletions magic_pdf/train_utils/remove_footer_header.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import re

from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO


"""
Copied from pre_proc/remove_footer_header.py.
"""


def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """Return one page's boxes and text blocks unchanged; nothing is removed.

    This function keeps the signature and the six-item return shape of a
    header/footer/page-number remover, but it performs no removal: the
    inputs are handed back as the "remaining" items and every "removed"
    list is empty. The removal code that used to follow an unconditional
    early return could never run and has been dropped.

    Args:
        text_raw_blocks: Text blocks of the page; returned as-is.
        image_bboxes: Image bounding boxes of the page; returned as-is.
        table_bboxes: Table bounding boxes of the page; returned as-is.
        header_bboxs: Unused; kept for call compatibility.
        footer_bboxs: Unused; kept for call compatibility.
        page_no_bboxs: Unused; kept for call compatibility.
        page_w: Unused; kept for call compatibility.
        page_h: Unused; kept for call compatibility.

    Returns:
        A 6-tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
        The first three are the very objects passed in (not copies); the
        last three are new empty lists.
    """
    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
Loading

0 comments on commit 439c18f

Please sign in to comment.