diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 21c9d7a2..f438f6e6 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -8,6 +8,7 @@ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.ocr_content_type import BlockType, ContentType +from magic_pdf.para.para_split_v3 import ListLineTag def __is_hyphen_at_line_end(line): @@ -124,7 +125,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, for para_block in paras_of_layout: para_text = '' para_type = para_block['type'] - if para_type == BlockType.Text: + if para_type in [BlockType.Text, BlockType.List, BlockType.Index]: para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang) elif para_type == BlockType.Title: para_text = f'# {merge_para_with_text(para_block, parse_type=parse_type, lang=lang)}' @@ -177,22 +178,26 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, return page_markdown -def merge_para_with_text(para_block, parse_type="auto", lang=None): - - def detect_language(text): - en_pattern = r'[a-zA-Z]+' - en_matches = re.findall(en_pattern, text) - en_length = sum(len(match) for match in en_matches) - if len(text) > 0: - if en_length / len(text) >= 0.5: - return 'en' - else: - return 'unknown' +def detect_language(text): + en_pattern = r'[a-zA-Z]+' + en_matches = re.findall(en_pattern, text) + en_length = sum(len(match) for match in en_matches) + if len(text) > 0: + if en_length / len(text) >= 0.5: + return 'en' else: - return 'empty' + return 'unknown' + else: + return 'empty' + +def merge_para_with_text(para_block, parse_type="auto", lang=None): para_text = '' - for line in para_block['lines']: + for i, line in enumerate(para_block['lines']): + + if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False): + para_text += ' \n' + line_text = '' line_lang = '' for span in line['spans']: diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py index 36265fb2..550a4cec 100644 --- a/magic_pdf/libs/draw_bbox.py +++ b/magic_pdf/libs/draw_bbox.py @@ -75,6 +75,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): titles_list = [] texts_list = [] interequations_list = [] + lists_list = [] + indexs_list = [] for page in pdf_info: page_dropped_list = [] @@ -83,6 +85,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): titles = [] texts = [] interequations = [] + lists = [] + indexs = [] for dropped_bbox in page['discarded_blocks']: page_dropped_list.append(dropped_bbox['bbox']) @@ -115,6 +119,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): texts.append(bbox) elif block['type'] == BlockType.InterlineEquation: interequations.append(bbox) + elif block['type'] == BlockType.List: + lists.append(bbox) + elif block['type'] == BlockType.Index: + indexs.append(bbox) + tables_list.append(tables) tables_body_list.append(tables_body) tables_caption_list.append(tables_caption) @@ -126,6 +135,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): titles_list.append(titles) texts_list.append(texts) interequations_list.append(interequations) + lists_list.append(lists) + indexs_list.append(indexs) layout_bbox_list = [] @@ -160,6 +171,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True) draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True) + draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True) + draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True) draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False) diff --git a/magic_pdf/libs/ocr_content_type.py b/magic_pdf/libs/ocr_content_type.py index 749c16f9..30d88cfd 100644 --- a/magic_pdf/libs/ocr_content_type.py +++ b/magic_pdf/libs/ocr_content_type.py @@ -20,6 +20,8 @@ class BlockType: InterlineEquation = 'interline_equation' Footnote = 'footnote' Discarded = 'discarded' + List = 'list' + Index = 'index' class CategoryId: diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py index 1235a0a8..bca9b987 100644 --- a/magic_pdf/model/pdf_extract_kit.py +++ b/magic_pdf/model/pdf_extract_kit.py @@ -340,7 +340,7 @@ def __call__(self, image): if torch.cuda.is_available(): properties = torch.cuda.get_device_properties(self.device) total_memory = properties.total_memory / (1024 ** 3) # 将字节转换为 GB - if total_memory <= 8: + if total_memory <= 10: gc_start = time.time() clean_memory() gc_time = round(time.time() - gc_start, 2) diff --git a/magic_pdf/para/para_split_v3.py b/magic_pdf/para/para_split_v3.py index d3407061..0ee2004a 100644 --- a/magic_pdf/para/para_split_v3.py +++ b/magic_pdf/para/para_split_v3.py @@ -1,8 +1,17 @@ import copy +from loguru import logger + from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE +from magic_pdf.libs.ocr_content_type import BlockType, ContentType LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';') +LIST_END_FLAG = ('.', '。', ';', ';') + + +class ListLineTag: + IS_LIST_START_LINE = "is_list_start_line" + IS_LIST_END_LINE = "is_list_end_line" def __process_blocks(blocks): @@ -38,7 +47,127 @@ def __process_blocks(blocks): return result -def __merge_2_blocks(block1, block2): +def __is_list_block(block): + # 一个block如果是list block 应该同时满足以下特征 + # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格(狗牙状) + # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.多个line以endflag结尾 + # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 左侧不顶格 + if len(block['lines']) >= 3: + first_line = block['lines'][0] + line_height = first_line['bbox'][3] - first_line['bbox'][1] + block_weight = block['bbox_fs'][2] - block['bbox_fs'][0] + + left_close_num = 0 + left_not_close_num = 0 + right_not_close_num = 0 + lines_text_list = [] + for line in block['lines']: + + line_text = "" + + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.Text: + line_text += span['content'].strip() + + lines_text_list.append(line_text) + + # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 + if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2: + left_close_num += 1 + elif line['bbox'][0] - block['bbox_fs'][0] > line_height: + # logger.info(f"{line_text}, {block['bbox_fs']}, {line['bbox']}") + left_not_close_num += 1 + + # 计算右侧是否不顶格,拍脑袋用0.3block宽度做阈值 + closed_area = 0.3 * block_weight + # closed_area = 5 * line_height + if block['bbox_fs'][2] - line['bbox'][2] > closed_area: + right_not_close_num += 1 + + # 判断lines_text_list中的元素是否有超过80%都以LIST_END_FLAG结尾 + line_end_flag = False + if len(lines_text_list) > 0: + num_end_count = 0 + for line_text in lines_text_list: + if len(line_text) > 0: + if line_text[-1] in LIST_END_FLAG: + num_end_count += 1 + + if num_end_count / len(lines_text_list) >= 0.8: + line_end_flag = True + + if left_close_num >= 2 and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2): + for line in block['lines']: + if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: + line[ListLineTag.IS_LIST_START_LINE] = True + if abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height: + line[ListLineTag.IS_LIST_END_LINE] = True + + return True + else: + return False + else: + return False + + +def __is_index_block(block): + # 一个block如果是index block 应该同时满足以下特征 + # 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字 + if len(block['lines']) >= 3: + first_line = block['lines'][0] + line_height = first_line['bbox'][3] - first_line['bbox'][1] + + left_close_num = 0 + right_close_num = 0 + + lines_text_list = [] + for line in block['lines']: + + # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 + if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: + left_close_num += 1 + + # 计算右侧是否不顶格 + if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height / 2: + right_close_num += 1 + + line_text = "" + + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.Text: + line_text += span['content'].strip() + + lines_text_list.append(line_text) + + # 判断lines_text_list中的元素是否有超过80%都以数字开头或都以数字结尾 + line_num_flag = False + if len(lines_text_list) > 0: + num_start_count = 0 + num_end_count = 0 + for line_text in lines_text_list: + if len(line_text) > 0: + if line_text[0].isdigit(): + num_start_count += 1 + if line_text[-1].isdigit(): + num_end_count += 1 + + if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8: + line_num_flag = True + + if left_close_num >= 2 and right_close_num >= 2 and line_num_flag: + for line in block['lines']: + line[ListLineTag.IS_LIST_START_LINE] = True + + return True + else: + return False + else: + return False + + +def __merge_2_text_blocks(block1, block2): if len(block1['lines']) > 0: first_line = block1['lines'][0] line_height = first_line['bbox'][3] - first_line['bbox'][1] @@ -59,17 +188,46 @@ def __merge_2_blocks(block1, block2): return block1, block2 +def __merge_2_list_blocks(block1, block2): + + if block1['page_num'] != block2['page_num']: + for line in block1['lines']: + for span in line['spans']: + span[CROSS_PAGE] = True + block2['lines'].extend(block1['lines']) + block1['lines'] = [] + block1[LINES_DELETED] = True + + return block1, block2 + + def __para_merge_page(blocks): page_text_blocks_groups = __process_blocks(blocks) for text_blocks_group in page_text_blocks_groups: + + if len(text_blocks_group) > 0: + # 需要先在合并前对所有block判断是否为list block + for block in text_blocks_group: + if __is_list_block(block): + block['type'] = BlockType.List + elif __is_index_block(block): + block['type'] = BlockType.Index + if len(text_blocks_group) > 1: # 倒序遍历 for i in range(len(text_blocks_group)-1, -1, -1): current_block = text_blocks_group[i] + # 检查是否有前一个块 if i - 1 >= 0: prev_block = text_blocks_group[i - 1] - __merge_2_blocks(current_block, prev_block) + + if current_block['type'] == 'text' and prev_block['type'] == 'text': + __merge_2_text_blocks(current_block, prev_block) + if current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List: + __merge_2_list_blocks(current_block, prev_block) + if current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index: + __merge_2_list_blocks(current_block, prev_block) else: continue diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py index 9767030b..8725b884 100644 --- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py +++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py @@ -108,7 +108,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) '''将剩余的bbox做分离处理,防止后面分layout时出错''' - # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) + all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) return all_bboxes, all_discarded_blocks diff --git a/magic_pdf/pre_proc/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py index d9a24319..69c4982f 100644 --- a/magic_pdf/pre_proc/ocr_dict_merge.py +++ b/magic_pdf/pre_proc/ocr_dict_merge.py @@ -49,8 +49,7 @@ def merge_spans_to_line(spans): continue # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 - if __is_overlaps_y_exceeds_threshold(span['bbox'], - current_line[-1]['bbox']): + if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.6): current_line.append(span) else: # 否则,开始新行