Skip to content

Commit

Permalink
Merge pull request #1436 from myhloli/dev
Browse files Browse the repository at this point in the history
feat(api): simplify markdown and content list generation
  • Loading branch information
myhloli authored Jan 7, 2025
2 parents 91e5fdd + 52efe94 commit ed20278
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 26 deletions.
18 changes: 16 additions & 2 deletions demo/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))

# read bytes
reader1 = FileBasedDataReader("")
Expand All @@ -45,14 +44,29 @@
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

### get model inference result
model_inference_result = infer_result.get_infer_res()

### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

### get markdown content
md_content = pipe_result.get_markdown(image_dir)

### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

### get content list content
content_list_content = pipe_result.get_content_list(image_dir)

### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

### get middle json
middle_json_content = pipe_result.get_middle_json()

### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
1 change: 0 additions & 1 deletion demo/demo1.json

This file was deleted.

1 change: 0 additions & 1 deletion demo/demo2.json

This file was deleted.

1 change: 0 additions & 1 deletion demo/small_ocr.json

This file was deleted.

16 changes: 6 additions & 10 deletions magic_pdf/operators/pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ def __init__(self, pipe_res, dataset: Dataset):
def get_markdown(
self,
img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.MM_MD,
) -> str:
"""Get markdown content.
Args:
img_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The type of Markdown content to generate. Defaults to MakeMode.MM_MD.
Returns:
Expand All @@ -50,7 +50,7 @@ def dump_md(
writer: DataWriter,
file_path: str,
img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.MM_MD,
):
"""Dump The Markdown.
Expand All @@ -59,7 +59,7 @@ def dump_md(
writer (DataWriter): File writer handle
file_path (str): The file location of markdown
img_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The type of Markdown content to generate. Defaults to MakeMode.MM_MD.
"""

Expand All @@ -72,22 +72,20 @@ def get_content_list(
self,
image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT,
) -> str:
"""Get Content List.
Args:
image_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
Returns:
str: content list content
"""
pdf_info_list = self._pipe_res['pdf_info']
content_list = union_make(
pdf_info_list,
md_make_mode,
MakeMode.STANDARD_FORMAT,
drop_mode,
image_dir_or_bucket_prefix,
)
Expand All @@ -99,7 +97,6 @@ def dump_content_list(
file_path: str,
image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT,
):
"""Dump Content List.
Expand All @@ -108,10 +105,9 @@ def dump_content_list(
file_path (str): The file location of content list
image_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
"""
content_list = self.get_content_list(
image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
image_dir_or_bucket_prefix, drop_mode=drop_mode,
)
writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
Expand Down
24 changes: 13 additions & 11 deletions next_docs/en/user_guide/usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ Local File Example
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
Expand Down Expand Up @@ -66,21 +65,24 @@ Local File Example
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
S3 File Example
Expand All @@ -93,7 +95,6 @@ S3 File Example
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.enums import SupportedPdfParseMethod
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
Expand Down Expand Up @@ -157,15 +158,16 @@ S3 File Example
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
md_content = pipe_result.get_markdown(image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
content_list_content = pipe_result.get_content_list(image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
MS-Office
----------
Expand Down

0 comments on commit ed20278

Please sign in to comment.