Skip to content

Commit

Permalink
pipeline调整
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Mar 21, 2024
1 parent b94fd7f commit 27c080a
Showing 1 changed file with 31 additions and 1 deletion.
32 changes: 31 additions & 1 deletion magic_pdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,35 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
return jso


def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
    """Convert the OCR intermediate dict carried by ``jso`` into markdown.

    Unless ``debug_mode`` is set, a record whose ``need_drop`` flag is truthy
    is marked with ``jso["dropped"] = True`` and returned without conversion.
    Otherwise ``jso["pdf_intermediate_dict"]`` is decompressed with
    ``JsonCompressor.decompress_json``, passed to
    ``ocr_mk_mm_markdown_with_para``, and the result is stored under
    ``jso["content"]``. The ``doc_layout_result``, ``pdf_intermediate_dict``
    and ``pdf_meta`` fields are then reset to empty strings.

    Args:
        jso: Record being processed; it is mutated in place.
        debug_mode: When true, the ``need_drop`` check is skipped.

    Returns:
        The updated record, or whatever ``exception_handler`` returns when
        any step of the conversion raises.
    """
    # The need_drop flag is only honoured when debug mode is off.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        # ``file=`` is a print() keyword, not a logging one, so it is not
        # passed to the logger here.
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed; decompress it first.
        pdf_intermediate_dict = JsonCompressor.decompress_json(
            jso["pdf_intermediate_dict"]
        )
        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
        jso["content"] = markdown_content
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
        )
        # Clear the fields that are no longer needed downstream.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        # Pipeline boundary: delegate every failure to the shared handler.
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
jso: dict, debug_mode=False
) -> dict:
Expand All @@ -520,7 +549,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
)
# 把无用的信息清空
jso["doc_layout_result"] = ""
jso["pdf_intermediate_dict"] = pdf_intermediate_dict
jso["pdf_intermediate_dict"] = ""
jso["mid_json_ocr"] = pdf_intermediate_dict
jso["pdf_meta"] = ""
except Exception as e:
jso = exception_handler(jso, e)
Expand Down

0 comments on commit 27c080a

Please sign in to comment.