Skip to content

Commit

Permalink
ocr_pdf_intermediate_dict_to_markdown_with_para: output NLP-format markdown
Browse files: browse the repository at this point in the history
  • Loading branch information
myhloli committed Mar 24, 2024
1 parent bf8d8e2 commit 07e4f11
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions magic_pdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
ocr_mk_nlp_markdown,
ocr_mk_mm_markdown,
ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination,
ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_nlp_markdown_with_para,
)
from magic_pdf.libs.commons import (
read_file,
Expand Down Expand Up @@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
pdf_intermediate_dict = jso["pdf_intermediate_dict"]
# 将 pdf_intermediate_dict 解压
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
# markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
jso["content"] = markdown_content
logger.info(
f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
Expand Down

0 comments on commit 07e4f11

Please sign in to comment.