Merge branch 'master' of https://github.com/magicpdf/Magic-PDF

opendatalab · Apr 25, 2024 · d74957c
2 parents c9f2109 + d286e3b
commit d74957c
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 23 deletions.
diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
     page_markdown = []
     for para_block in paras_of_layout:
         para_text = ''
-        para_type = para_block.get('type')
+        para_type = para_block['type']
         if para_type == BlockType.Text:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
@@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
-                img_blocks = para_block.get('blocks')
-                for img_block in img_blocks:
-                    if img_block.get('type') == BlockType.ImageBody:
-                        for line in img_block.get('lines'):
+                for block in para_block['blocks']:
+                    if block['type'] == BlockType.ImageBody:
+                        for line in block['lines']:
                             for span in line['spans']:
-                                if span.get('type') == ContentType.Image:
+                                if span['type'] == ContentType.Image:
                                     para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
-                for img_block in img_blocks:
-                    if img_block.get('type') == BlockType.ImageCaption:
-                        para_text += merge_para_with_text(img_block)
+                for block in para_block['blocks']:
+                    if block['type'] == BlockType.ImageCaption:
+                        para_text += merge_para_with_text(block)
         elif para_type == BlockType.Table:
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
-                table_blocks = para_block.get('blocks')
-                for table_block in table_blocks:
-                    if table_block.get('type') == BlockType.TableBody:
-                        for line in table_block.get('lines'):
+                for block in para_block['blocks']:
+                    if block['type'] == BlockType.TableBody:
+                        for line in block['lines']:
                             for span in line['spans']:
-                                if span.get('type') == ContentType.Table:
+                                if span['type'] == ContentType.Table:
                                     para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
-                for table_block in table_blocks:
-                    if table_block.get('type') == BlockType.TableCaption:
-                        para_text += merge_para_with_text(table_block)
-                    elif table_block.get('type') == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(table_block)
+                for block in para_block['blocks']:
+                    if block['type'] == BlockType.TableCaption:
+                        para_text += merge_para_with_text(block)
+                    elif block['type'] == BlockType.TableFootnote:
+                        para_text += merge_para_with_text(block)
 
         if para_text.strip() == '':
             continue
@@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
     return page_markdown
 
 
-def merge_para_with_text(para):
+def merge_para_with_text(para_block):
     para_text = ''
-    for line in para['lines']:
+    for line in para_block['lines']:
         for span in line['spans']:
-            span_type = span.get('type')
+            span_type = span['type']
             content = ''
             language = ''
             if span_type == ContentType.Text:
@@ -159,6 +157,7 @@ def merge_para_with_text(para):
                 content = f"${span['content']}$"
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
+
             if content != '':
                 if language == 'en':  # 英文语境下 content间需要空格分隔
                     para_text += content + ' '

diff --git a/magic_pdf/pdf_parse_by_ocr_v2.py b/magic_pdf/pdf_parse_by_ocr_v2.py
@@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes,
         '''将所有区块的bbox整理到一起'''
         all_bboxes = ocr_prepare_bboxes_for_layout_split(
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
-            interline_equation_blocks, page_w, page_h)
+            interline_equations, page_w, page_h)
 
         '''根据区块信息计算layout'''
         page_boundry = [0, 0, page_w, page_h]