Skip to content

Commit

Permalink
feat(api): simplify markdown and content list generation
Browse files Browse the repository at this point in the history
- Remove DropMode and MakeMode imports from user code
- Set default drop_mode to DropMode.NONE in get_markdown and dump_md methods (get_content_list already defaulted to DropMode.NONE)
- Remove md_make_mode parameter from get_content_list and dump_content_list methods
- Add dump_middle_json method to PipeResult
- Update examples in API documentation and demo script
  • Loading branch information
myhloli committed Jan 7, 2025
1 parent 15db6fe commit 52efe94
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 31 deletions.
16 changes: 9 additions & 7 deletions demo/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode

# args
pdf_file_name = "demo1.pdf" # replace with the real pdf path
Expand Down Expand Up @@ -54,17 +53,20 @@
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

### get markdown content
md_content = pipe_result.get_markdown(image_dir)

### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

### get content list content
content_list_content = pipe_result.get_content_list(image_dir)

### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD)

### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)

### get middle json
middle_json_content = pipe_result.get_middle_json()

### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
1 change: 0 additions & 1 deletion demo/demo1.json

This file was deleted.

1 change: 0 additions & 1 deletion demo/demo2.json

This file was deleted.

1 change: 0 additions & 1 deletion demo/small_ocr.json

This file was deleted.

16 changes: 6 additions & 10 deletions magic_pdf/operators/pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ def __init__(self, pipe_res, dataset: Dataset):
def get_markdown(
self,
img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.MM_MD,
) -> str:
"""Get markdown content.
Args:
img_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The type of Markdown content to generate. Defaults to MakeMode.MM_MD.
Returns:
Expand All @@ -50,7 +50,7 @@ def dump_md(
writer: DataWriter,
file_path: str,
img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.MM_MD,
):
"""Dump The Markdown.
Expand All @@ -59,7 +59,7 @@ def dump_md(
writer (DataWriter): File writer handle
file_path (str): The file location of markdown
img_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The type of Markdown content to generate. Defaults to MakeMode.MM_MD.
"""

Expand All @@ -72,22 +72,20 @@ def get_content_list(
self,
image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT,
) -> str:
"""Get Content List.
Args:
image_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
Returns:
str: content list content
"""
pdf_info_list = self._pipe_res['pdf_info']
content_list = union_make(
pdf_info_list,
md_make_mode,
MakeMode.STANDARD_FORMAT,
drop_mode,
image_dir_or_bucket_prefix,
)
Expand All @@ -99,7 +97,6 @@ def dump_content_list(
file_path: str,
image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT,
):
"""Dump Content List.
Expand All @@ -108,10 +105,9 @@ def dump_content_list(
file_path (str): The file location of content list
image_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
"""
content_list = self.get_content_list(
image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
image_dir_or_bucket_prefix, drop_mode=drop_mode,
)
writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
Expand Down
24 changes: 13 additions & 11 deletions next_docs/en/user_guide/usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ Local File Example
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
Expand Down Expand Up @@ -66,21 +65,24 @@ Local File Example
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
S3 File Example
Expand All @@ -93,7 +95,6 @@ S3 File Example
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.enums import SupportedPdfParseMethod
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
Expand Down Expand Up @@ -157,15 +158,16 @@ S3 File Example
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
md_content = pipe_result.get_markdown(image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
content_list_content = pipe_result.get_content_list(image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
MS-Office
----------
Expand Down

0 comments on commit 52efe94

Please sign in to comment.