-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from myhloli/feat/add_extract_train_data
feat: add extract_train_data
- Loading branch information
Showing
7 changed files
with
1,591 additions
and
134 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
|
||
|
||
def convert_to_train_format(jso: dict) -> list:
    """Convert per-page parse results into a list of training-format records.

    Args:
        jso: Mapping whose values are per-page dicts. Only the values are
            read; the keys are ignored. Every page dict must provide
            ``page_idx``, ``page_size`` (unpacked as ``width, height``),
            ``image_bboxes_with_caption``, ``table_bboxes_with_caption``,
            ``bak_page_no_bboxes``, ``bak_header_bboxes``,
            ``bak_footer_bboxes``, ``para_blocks``, ``inline_equations`` and
            ``interline_equations``.

    Returns:
        One dict per page, in the iteration order of ``jso``::

            {
                "page_info": {"page_no": ..., "height": ..., "width": ...},
                "bboxes": [{"category_id": int, "bbox": ...}, ...],
            }

        Entries built from the image and table lists additionally carry
        ``"caption_bbox"`` when the source entry has a ``"caption"`` key.

    Raises:
        KeyError: If a page dict lacks one of the required keys.
    """
    # (page key, category_id) for lists of dicts that hold "bbox" and,
    # optionally, "caption".
    captioned_sources = (
        ("image_bboxes_with_caption", 1),
        ("table_bboxes_with_caption", 7),
    )
    # (page key, category_id) for lists whose items are the bbox values
    # themselves.
    raw_bbox_sources = (
        ("bak_page_no_bboxes", 4),
        ("bak_header_bboxes", 3),
        ("bak_footer_bboxes", 6),
    )
    # (page key, category_id) for lists of dicts that hold a "bbox" key.
    # NOTE: the original code carried a remark at this point that no footnote
    # examples had been seen so far; no footnote entries are emitted.
    keyed_bbox_sources = (
        ("para_blocks", 2),
        ("inline_equations", 13),
        ("interline_equations", 10),
    )

    pages = []
    for page in jso.values():
        page_idx = page["page_idx"]
        width, height = page["page_size"]
        info = {
            "page_info": {"page_no": page_idx, "height": height, "width": width}
        }

        bboxes = []
        for key, category_id in captioned_sources:
            for item in page[key]:
                entry = {"category_id": category_id, "bbox": item["bbox"]}
                if "caption" in item:
                    entry["caption_bbox"] = item["caption"]
                bboxes.append(entry)

        for key, category_id in raw_bbox_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": bbox} for bbox in page[key]
            )

        for key, category_id in keyed_bbox_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": item["bbox"]}
                for item in page[key]
            )

        info["bboxes"] = bboxes
        pages.append(info)

    return pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from magic_pdf.libs.boxbase import _is_in | ||
|
||
|
||
def extract_caption_bbox(outer: list, inner: list) -> list:
    """Pair each inner box with an enclosing outer box and derive a caption box.

    For every box in ``inner``, the first still-unused box in ``outer`` is
    selected for which ``_is_in(inner_box, outer_box)`` is true and whose four
    coordinates are not all approximately equal to the inner box's. The
    caption is the largest-area one of the four strips of the outer box that
    lie beside the inner box. Each outer box is consumed by at most one inner
    box.

    Args:
        outer: Boxes, each unpackable as ``x0, y0, x1, y1``.
        inner: Boxes, each unpackable as ``x0, y0, x1, y1``.

    Returns:
        One dict per inner box, in input order::

            {
                "bbox": [1, 2, 3, 4],
                "caption": [5, 6, 7, 8],  # only when an outer box matched
            }
    """

    def is_float_equal(a, b):
        # Non-strict float comparison: absolute tolerance of 0.01.
        return abs(a - b) < 0.01

    def area(rect):
        return abs(rect[0] - rect[2]) * abs(rect[1] - rect[3])

    # Outer boxes not yet consumed, keyed by their original position so a
    # matched box can be removed without disturbing the others.
    unused_outer = dict(enumerate(outer))
    ret = []
    for box in inner:
        ix0, iy0, ix1, iy1 = box
        entry = {"bbox": box[:4]}

        found_idx = None
        for idx, candidate in unused_outer.items():
            ox0, oy0, ox1, oy1 = candidate
            same_rect = all(
                [
                    is_float_equal(ix0, ox0),
                    is_float_equal(iy0, oy0),
                    is_float_equal(ix1, ox1),
                    is_float_equal(iy1, oy1),
                ]
            )
            # An outer box that coincides with the inner box leaves no room
            # for a caption, so it is skipped.
            # NOTE(review): _is_in presumably tests that the first box lies
            # inside the second - confirm against magic_pdf.libs.boxbase.
            if _is_in(box, candidate) and not same_rect:
                found_idx = idx
                break

        if found_idx is not None:
            # The same outer box may only locate the caption of one inner
            # box, so it is removed once matched.
            ox0, oy0, ox1, oy1 = unused_outer.pop(found_idx)
            strips = [
                [ox0, oy0, ix0, oy1],  # outer region with x below ix0
                [ox0, oy0, ox1, iy0],  # outer region with y below iy0
                [ox0, iy1, ox1, oy1],  # outer region with y above iy1
                [ix1, oy0, ox1, oy1],  # outer region with x above ix1
            ]
            # The strip with the largest area is taken as the caption.
            # sorted() is stable, so on ties the later strip in the list wins.
            entry["caption"] = sorted(strips, key=area)[-1]

        ret.append(entry)

    return ret
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
import re | ||
|
||
from magic_pdf.libs.boxbase import _is_in_or_part_overlap | ||
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO | ||
|
||
|
||
""" | ||
copy from pre_proc/remove_footer_header.py | ||
""" | ||
|
||
|
||
def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """Return the page content unchanged; header/footer/page-number removal is off.

    This function previously began with an unconditional ``if 1: return ...``
    guard, which made all of the removal logic after it unreachable. That
    dead code has been dropped so the actual behavior is visible: nothing is
    removed and no argument is modified. The file notes that the function was
    copied from ``pre_proc/remove_footer_header.py``; consult that module if
    the removal logic is ever needed here again.

    Args:
        text_raw_blocks: Text blocks of the page; returned as-is.
        image_bboxes: Image boxes of the page; returned as-is.
        table_bboxes: Table boxes of the page; returned as-is.
        header_bboxs: Unused; kept for interface compatibility.
        footer_bboxs: Unused; kept for interface compatibility.
        page_no_bboxs: Unused; kept for interface compatibility.
        page_w: Unused; kept for interface compatibility.
        page_h: Unused; kept for interface compatibility.

    Returns:
        A 6-tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
        The first three are the very objects passed in; the last three are
        always new empty lists.
    """
    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
Oops, something went wrong.