Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: add more method description #1336

Merged
merged 1 commit into from
Dec 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions next_docs/en/user_guide/usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Local File Example
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode

# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
Expand All @@ -31,7 +32,6 @@ Local File Example
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))

# read bytes
reader1 = FileBasedDataReader("")
Expand All @@ -57,6 +57,9 @@ Local File Example
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

### get model inference result
model_inference_result = infer_result.get_infer_res()

### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

Expand All @@ -69,6 +72,16 @@ Local File Example
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)

### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)

### get middle json
middle_json_content = pipe_result.get_middle_json()



S3 File Example
^^^^^^^^^^^^^^^^
Expand All @@ -80,20 +93,25 @@ S3 File Example
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.enums import SupportedPdfParseMethod

bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url


reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)

local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))

# args
pdf_file_name = (
"s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)

# prepare env
Expand Down Expand Up @@ -123,6 +141,9 @@ S3 File Example
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

### get model inference result
model_inference_result = infer_result.get_infer_res()

### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

Expand All @@ -135,6 +156,16 @@ S3 File Example
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)

### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)

### get middle json
middle_json_content = pipe_result.get_middle_json()



MS-Office
----------
Expand Down
Loading