From 2277e31ff426281e804d18ec0255da0b75f52583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Fri, 22 Mar 2024 16:33:54 +0800 Subject: [PATCH] =?UTF-8?q?ocr=5Fdemo=20main=E5=87=BD=E6=95=B0=E7=B2=BE?= =?UTF-8?q?=E7=AE=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- demo/ocr_demo.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/demo/ocr_demo.py b/demo/ocr_demo.py index 05557f4d..577b701f 100644 --- a/demo/ocr_demo.py +++ b/demo/ocr_demo.py @@ -6,7 +6,13 @@ from app.common.s3 import get_s3_config from demo.demo_test import get_json_from_local_or_s3 -from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format +from magic_pdf.dict2md.ocr_mkcontent import ( + ocr_mk_mm_markdown_with_para, + ocr_mk_nlp_markdown, + ocr_mk_mm_markdown, + ocr_mk_mm_standard_format, + ocr_mk_mm_markdown_with_para_and_pagination +) from magic_pdf.libs.commons import join_path from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr @@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True): # logger.info(json_object) s3_pdf_path = json_object["file_location"] s3_config = get_s3_config(s3_pdf_path) - ocr_pdf_model_info = json_object["doc_layout_result"] + ocr_pdf_model_info = json_object.get("doc_layout_result") ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config) except Exception as e: logger.exception(e) @@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, # markdown_content = mk_nlp_markdown(pdf_info_dict) markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict) + # markdown_pagination = ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict) with open(text_content_save_path, "w", encoding="utf-8") as f: f.write(markdown_content) @@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, if __name__ == '__main__': - #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf" - #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json" - # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf" - # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json" - - ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" - ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" - # ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf" - # ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json" - ocr_local_parse(ocr_pdf_path, ocr_json_file_path) - #ocr_online_parse(book_name="美国加州中学教材/edu_00000060") + # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" + # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" + # ocr_local_parse(pdf_path, json_file_path) + # book_name = "数学新星网/edu_00001236" + # ocr_online_parse(book_name) + pass