Skip to content

Commit

Permalink
ocr_demo main函数精简
Browse files (browse the repository at this point in the history)
  • Loading branch information
myhloli committed Mar 22, 2024
1 parent 7d010e1 commit 2277e31
Showing 1 changed file with 15 additions and 13 deletions.
28 changes: 15 additions & 13 deletions demo/ocr_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@

from app.common.s3 import get_s3_config
from demo.demo_test import get_json_from_local_or_s3
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_mm_markdown_with_para,
ocr_mk_nlp_markdown,
ocr_mk_mm_markdown,
ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para_and_pagination
)
from magic_pdf.libs.commons import join_path
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr

Expand Down Expand Up @@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
# logger.info(json_object)
s3_pdf_path = json_object["file_location"]
s3_config = get_s3_config(s3_pdf_path)
ocr_pdf_model_info = json_object["doc_layout_result"]
ocr_pdf_model_info = json_object.get("doc_layout_result")
ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
except Exception as e:
logger.exception(e)
Expand All @@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,

# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
# markdown_pagination = ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict)

with open(text_content_save_path, "w", encoding="utf-8") as f:
f.write(markdown_content)
Expand All @@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,


if __name__ == '__main__':
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"

ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
#ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
# pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
# json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
# book_name = "数学新星网/edu_00001236"
# ocr_online_parse(book_name)
pass

0 comments on commit 2277e31

Please sign in to comment.