diff --git a/next_docs/en/user_guide/usage/api.rst b/next_docs/en/user_guide/usage/api.rst index 3a1cca07..d6802a38 100644 --- a/next_docs/en/user_guide/usage/api.rst +++ b/next_docs/en/user_guide/usage/api.rst @@ -17,6 +17,7 @@ Local File Example from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.config.enums import SupportedPdfParseMethod + from magic_pdf.config.make_content_config import DropMode, MakeMode # args pdf_file_name = "abc.pdf" # replace with the real pdf path @@ -31,7 +32,6 @@ Local File Example image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( local_md_dir ) - image_dir = str(os.path.basename(local_image_dir)) # read bytes reader1 = FileBasedDataReader("") @@ -57,6 +57,9 @@ Local File Example ### draw model result on each page infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) + ### get model inference result + model_inference_result = infer_result.get_infer_res() + ### draw layout result on each page pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) @@ -69,6 +72,16 @@ Local File Example ### dump content list pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) + ### get markdown content + md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) + + ### get content list content + content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) + + ### get middle json + middle_json_content = pipe_result.get_middle_json() + + S3 File Example ^^^^^^^^^^^^^^^^ @@ -80,20 +93,25 @@ S3 File Example from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + from magic_pdf.config.make_content_config import DropMode, MakeMode + from magic_pdf.config.enums import SupportedPdfParseMethod bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name ak = "{Your S3 access key}" # replace with real s3 access key sk = "{Your S3 secret key}" # replace with real s3 secret key endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url - reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url) image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url) + md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url) + + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) # args pdf_file_name = ( - "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path + f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf" # replace with the real s3 path ) # prepare env @@ -123,6 +141,9 @@ S3 File Example ### draw model result on each page infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) + ### get model inference result + model_inference_result = infer_result.get_infer_res() + ### draw layout result on each page pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) @@ -135,6 +156,16 @@ S3 File Example ### dump content list pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) + ### get markdown content + md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) + + ### get content list content + content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) + + ### get middle json + middle_json_content = pipe_result.get_middle_json() + + MS-Office ----------