diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 188b25a5..fb0357b3 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): page_markdown = [] for para_block in paras_of_layout: para_text = '' - para_type = para_block.get('type') + para_type = para_block['type'] if para_type == BlockType.Text: para_text = merge_para_with_text(para_block) elif para_type == BlockType.Title: @@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): if mode == 'nlp': continue elif mode == 'mm': - img_blocks = para_block.get('blocks') - for img_block in img_blocks: - if img_block.get('type') == BlockType.ImageBody: - for line in img_block.get('lines'): + for block in para_block['blocks']: + if block['type'] == BlockType.ImageBody: + for line in block['lines']: for span in line['spans']: - if span.get('type') == ContentType.Image: + if span['type'] == ContentType.Image: para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" - for img_block in img_blocks: - if img_block.get('type') == BlockType.ImageCaption: - para_text += merge_para_with_text(img_block) + for block in para_block['blocks']: + if block['type'] == BlockType.ImageCaption: + para_text += merge_para_with_text(block) elif para_type == BlockType.Table: if mode == 'nlp': continue elif mode == 'mm': - table_blocks = para_block.get('blocks') - for table_block in table_blocks: - if table_block.get('type') == BlockType.TableBody: - for line in table_block.get('lines'): + for block in para_block['blocks']: + if block['type'] == BlockType.TableBody: + for line in block['lines']: for span in line['spans']: - if span.get('type') == ContentType.Table: + if span['type'] == ContentType.Table: para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" - for table_block in table_blocks: - if table_block.get('type') == BlockType.TableCaption: - para_text += merge_para_with_text(table_block) - elif table_block.get('type') == BlockType.TableFootnote: - para_text += merge_para_with_text(table_block) + for block in para_block['blocks']: + if block['type'] == BlockType.TableCaption: + para_text += merge_para_with_text(block) + elif block['type'] == BlockType.TableFootnote: + para_text += merge_para_with_text(block) if para_text.strip() == '': continue @@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): return page_markdown -def merge_para_with_text(para): +def merge_para_with_text(para_block): para_text = '' - for line in para['lines']: + for line in para_block['lines']: for span in line['spans']: - span_type = span.get('type') + span_type = span['type'] content = '' language = '' if span_type == ContentType.Text: @@ -159,6 +157,7 @@ def merge_para_with_text(para): content = f"${span['content']}$" elif span_type == ContentType.InterlineEquation: content = f"\n$$\n{span['content']}\n$$\n" + if content != '': if language == 'en': # 英文语境下 content间需要空格分隔 para_text += content + ' ' diff --git a/magic_pdf/pdf_parse_by_ocr_v2.py b/magic_pdf/pdf_parse_by_ocr_v2.py index e7324587..cab4f2f4 100644 --- a/magic_pdf/pdf_parse_by_ocr_v2.py +++ b/magic_pdf/pdf_parse_by_ocr_v2.py @@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes, '''将所有区块的bbox整理到一起''' all_bboxes = ocr_prepare_bboxes_for_layout_split( img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, - interline_equation_blocks, page_w, page_h) + interline_equations, page_w, page_h) '''根据区块信息计算layout''' page_boundry = [0, 0, page_w, page_h]