From 27c080a944862becd393709a076daaccf0a9675c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?=
Date: Thu, 21 Mar 2024 17:39:30 +0800
Subject: [PATCH] =?UTF-8?q?pipeline=E8=B0=83=E6=95=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (not part of the commit message):

* Adds ocr_pdf_intermediate_dict_to_markdown_with_para(jso, debug_mode=False).
  Unless debug_mode is set, a truthy jso["need_drop"] makes it log the book
  name, set jso["dropped"] = True and return early. Otherwise it decompresses
  jso["pdf_intermediate_dict"] with JsonCompressor.decompress_json, passes the
  result to ocr_mk_mm_markdown_with_para, stores the output in jso["content"],
  and blanks "doc_layout_result", "pdf_intermediate_dict" and "pdf_meta".
  Any exception raised in that step is passed to exception_handler.
* ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa now stores its
  pdf_intermediate_dict local under jso["mid_json_ocr"] and blanks
  jso["pdf_intermediate_dict"] instead of writing the local back to it.
* The comment on the context line of the second hunk is left untranslated
  because context lines must match the target file byte for byte.
* NOTE(review): both logger.info calls pass file=sys.stderr; whether the
  logger in use accepts that keyword is not visible in this patch - confirm.
* NOTE(review): the line breaks and indentation of this patch were restored
  from a whitespace-collapsed copy - verify against the target file.

 magic_pdf/pipeline.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/magic_pdf/pipeline.py b/magic_pdf/pipeline.py
index 886f872d..64e16e8e 100644
--- a/magic_pdf/pipeline.py
+++ b/magic_pdf/pipeline.py
@@ -496,6 +496,35 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
     return jso
 
 
+def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
+    """Turn the compressed jso["pdf_intermediate_dict"] into jso["content"] and return jso."""
+    if debug_mode:
+        pass
+    else:  # when debug mode is off, check whether the need_drop field is set
+        if jso.get("need_drop", False):
+            book_name = join_path(get_data_source(jso), jso["file_id"])
+            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
+            jso["dropped"] = True
+            return jso
+    try:
+        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
+        # decompress pdf_intermediate_dict
+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
+        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
+        jso["content"] = markdown_content
+        logger.info(
+            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
+            file=sys.stderr,
+        )
+        # clear the fields that are no longer needed
+        jso["doc_layout_result"] = ""
+        jso["pdf_intermediate_dict"] = ""
+        jso["pdf_meta"] = ""
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso
+
+
 def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
     jso: dict, debug_mode=False
 ) -> dict:
@@ -520,7 +549,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
         )
         # 把无用的信息清空
         jso["doc_layout_result"] = ""
-        jso["pdf_intermediate_dict"] = pdf_intermediate_dict
+        jso["pdf_intermediate_dict"] = ""
+        jso["mid_json_ocr"] = pdf_intermediate_dict
         jso["pdf_meta"] = ""
     except Exception as e:
         jso = exception_handler(jso, e)