+ NOTE: このドキュメントはすでに古くなっています。最新版のドキュメントを参照してください。 +
[English](README.md) | [简体中文](README_zh-CN.md) | [日本語](README_ja-JP.md) This might be because the server's CPU does not support the AVX/AVX2 instruction set, or the CPU itself supports it but has been disabled by the system administrator. You can try contacting the system administrator to remove the restriction or change to a different server.

References: https://github.com/opendatalab/MinerU/issues/591 , https://github.com/opendatalab/MinerU/issues/736 Test CUDA Acceleration

If your graphics card has at least **8GB** of VRAM, follow these steps to test CUDA acceleration:

1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.

   ```json
   {
     "device-mode": "cuda"
   }
   ```

2. Test CUDA acceleration with the following command:
   ```sh
   magic-pdf -p small_ocr.pdf -o ./output
   ``` Test OCR acceleration with the following command:
   ```sh
   magic-pdf -p small_ocr.pdf -o ./output
   ``` Test CUDA Acceleration @@ -86,7 +86,7 @@ If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA- 3. **Run the following command to test CUDA acceleration**: ``` - magic-pdf -p small_ocr.pdf + magic-pdf -p small_ocr.pdf -o ./output ``` ### 9. Enable CUDA Acceleration for OCR

1. **Download paddlepaddle-gpu**, which will automatically enable OCR acceleration upon installation.

   ```
   pip install paddlepaddle-gpu==2.6.1
   ```

2. **Run the following command to test OCR acceleration**:

   ```
   magic-pdf -p small_ocr.pdf -o ./output
   ``` Args: - default_bucket (str): the default bucket name of the relative path + default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket} s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list. Raises: - InvalidConfig: default bucket config not in s3_configs - InvalidConfig: bucket name not unique in s3_configs - InvalidConfig: default bucket must be provided + InvalidConfig: default bucket config not in s3_configs. + InvalidConfig: bucket name not unique in s3_configs. + InvalidConfig: default bucket must be provided. """ - if len(default_bucket) == 0: - raise InvalidConfig('default_bucket must be provided') + if len(default_prefix) == 0: + raise InvalidConfig('default_prefix must be provided') + + arr = default_prefix.strip("/").split("/") + self.default_bucket = arr[0] + self.default_prefix = "/".join(arr[1:]) found_default_bucket_config = False for conf in s3_configs: - if conf.bucket_name == default_bucket: + if conf.bucket_name == self.default_bucket: found_default_bucket_config = True break if not found_default_bucket_config: raise InvalidConfig( - f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}' + f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}' ) uniq_bucket = set([conf.bucket_name for conf in s3_configs]) @@ -39,7 +44,6 @@ def __init__(self, default_bucket: str, s3_configs: list[S3Config]): f'the bucket_name in s3_configs: {s3_configs} must be unique' ) - self.default_bucket = default_bucket self.s3_configs = s3_configs self._s3_clients_h: dict = {} @@ -47,14 +51,14 @@ def __init__(self, default_bucket: str, s3_configs: list[S3Config]): class MultiBucketS3DataReader(DataReader, MultiS3Mixin): def read(self, path: str) -> bytes: """Read the path from s3, select diffect bucket client for each request - based on the path, also support range read. 
+ based on the bucket, also support range read. Args: - path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit - for example: s3://bucket_name/path?0,100 + path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit. + for example: s3://bucket_name/path?0,100. Returns: - bytes: the content of s3 file + bytes: the content of s3 file. """ may_range_params = parse_s3_range_params(path) if may_range_params is None or 2 != len(may_range_params): @@ -84,21 +88,22 @@ def __get_s3_client(self, bucket_name: str): def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: """Read the file with offset and limit, select diffect bucket client - for each request based on the path. + for each request based on the bucket. Args: - path (str): the file path + path (str): the file path. offset (int, optional): the number of bytes skipped. Defaults to 0. limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite. Returns: - bytes: the file content + bytes: the file content. """ if path.startswith('s3://'): bucket_name, path = parse_s3path(path) s3_reader = self.__get_s3_client(bucket_name) else: s3_reader = self.__get_s3_client(self.default_bucket) + path = os.path.join(self.default_prefix, path) return s3_reader.read_at(path, offset, limit) @@ -123,15 +128,16 @@ def __get_s3_client(self, bucket_name: str): def write(self, path: str, data: bytes) -> None: """Write file with data, also select diffect bucket client for each - request based on the path. + request based on the bucket. Args: path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - data (bytes): the data want to write + data (bytes): the data want to write. 
""" if path.startswith('s3://'): bucket_name, path = parse_s3path(path) s3_writer = self.__get_s3_client(bucket_name) else: s3_writer = self.__get_s3_client(self.default_bucket) + path = os.path.join(self.default_prefix, path) return s3_writer.write(path, data) diff --git a/magic_pdf/data/data_reader_writer/s3.py b/magic_pdf/data/data_reader_writer/s3.py index b6f27a27..34ec43b7 100644 --- a/magic_pdf/data/data_reader_writer/s3.py +++ b/magic_pdf/data/data_reader_writer/s3.py @@ -6,6 +6,7 @@ class S3DataReader(MultiBucketS3DataReader): def __init__( self, + default_prefix_without_bucket: str, bucket: str, ak: str, sk: str, @@ -15,6 +16,7 @@ def __init__( """s3 reader client. Args: + default_prefix_without_bucket: prefix that not contains bucket bucket (str): bucket name ak (str): access key sk (str): secret key @@ -23,7 +25,7 @@ def __init__( refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html """ super().__init__( - bucket, + f'{bucket}/{default_prefix_without_bucket}', [ S3Config( bucket_name=bucket, @@ -39,6 +41,7 @@ def __init__( class S3DataWriter(MultiBucketS3DataWriter): def __init__( self, + default_prefix_without_bucket: str, bucket: str, ak: str, sk: str, @@ -48,6 +51,7 @@ def __init__( """s3 writer client. def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'): + def __init__(self, model_path, max_new_tokens=1024, max_time=60): # init - self.model_path = model_path - self.max_new_tokens = max_new_tokens # maximum output tokens length - self.max_time = max_time # timeout for processing in seconds - if device == 'cuda': - self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda() + assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model." 
+ self.model = build_model( + model_ckpt=model_path, + max_new_tokens=max_new_tokens, + max_time=max_time, + lmdeploy=False, + flash_attn=False, + batch_size=1, + ).cuda() + self.default_format = "html" + + def predict(self, images, output_format=None, **kwargs): + + if output_format is None: + output_format = self.default_format else: - self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time) + if output_format not in ['latex', 'markdown', 'html']: + raise ValueError(f"Output format {output_format} is not supported.") + + results = self.model( + images, output_format=output_format + ) + + if output_format == "html": + results = [self.minify_html(html) for html in results] - def image2latex(self, image) -> str: - table_latex = self.model.forward(image) - return table_latex + return results - def image2html(self, image) -> str: - table_latex = self.image2latex(image) - table_html = convert_text(table_latex, 'html', format='latex') - return table_html + def minify_html(self, html): + # 移除多余的空白字符 + html = re.sub(r'\s+', ' ', html) + # 移除行尾的空白字符 + html = re.sub(r'\s*>\s*', '>', html) + # 移除标签前的空白字符 + html = re.sub(r'\s*<\s*', '<', html) + return html.strip() \ No newline at end of file diff --git a/magic_pdf/model/ppTableModel.py b/magic_pdf/model/ppTableModel.py index 933f31a0..3f08d78d 100644 --- a/magic_pdf/model/ppTableModel.py +++ b/magic_pdf/model/ppTableModel.py @@ -1,3 +1,4 @@ +import cv2 from paddleocr.ppstructure.table.predict_table import TableSystem from paddleocr.ppstructure.utility import init_args from magic_pdf.libs.Constants import * @@ -36,12 +37,13 @@ def img2html(self, image): - HTML (str): A string representing the HTML structure with content of the table. """ if isinstance(image, Image.Image): - image = np.array(image) + image = np.asarray(image) + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) pred_res, _ = self.table_sys(image) pred_html = pred_res["html"] - res = '' + pred_html.replace("
- requirements: next_docs/requirements.txt

sphinx:
  configuration: next_docs/en/conf.py When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.

On macOS, the default shell has switched from Bash to Z shell, which has special handling logic for certain types of string matching. This can lead to the "no matches found" error. You can try disabling the globbing feature in the command line and then run the installation command again.

.. code:: bash

   setopt no_nomatch
   pip install magic-pdf[full]

2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use

This might be due to an incomplete download of the model file. You can try re-downloading the model file and then try again. Reference: https://github.com/opendatalab/MinerU/issues/143

3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?

The path for the model files is configured in "magic-pdf.json", for example:

.. code:: json

   {
     "models-dir": "/tmp/models"
   }

This path is an absolute path, not a relative path. You can obtain the absolute path in the models directory using the "pwd" command.
Reference: https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874

4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2

The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can install the ``libgl`` library with the following command to resolve the issue:

.. code:: bash

   sudo apt-get install libgl1-mesa-glx

Reference: https://github.com/opendatalab/MinerU/issues/388

5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``

You need to uninstall the module and reinstall it:

.. code:: bash

   pip uninstall fairscale
   pip install fairscale

Reference: https://github.com/opendatalab/MinerU/issues/411

6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.

The compatibility of cuda11 with new graphics cards is poor, and the CUDA version used by Paddle needs to be upgraded.

.. code:: bash

   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/

Reference: https://github.com/opendatalab/MinerU/issues/558 When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions. from docutils import nodes
from docutils.parsers.rst import Directive

Project Introduction
--------------------

MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format. MinerU was born during the pre-training process of `InternLM <https://github.com/InternLM/InternLM>`__. We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models. Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on `issue <https://github.com/opendatalab/MinerU/issues>`__ and **attach the relevant PDF**.

Key Features
------------

- Removes elements such as headers, footers, footnotes, and page numbers while maintaining semantic continuity
- Outputs text in a human-readable order from multi-column documents
- Retains the original structure of the document, including titles, paragraphs, and lists
- Extracts images, image captions, tables, and table captions
- Automatically recognizes formulas in the document and converts them to LaTeX
- Automatically recognizes tables in the document and converts them to LaTeX
- Automatically detects and enables OCR for corrupted PDFs
- Supports both CPU and GPU environments
- Supports Windows, Linux, and Mac platforms It is easy to implement new classes; the only requirement is to inherit from ``DataReader`` or ``DataWriter``

.. code:: python

   class SomeReader(DataReader):
       def read(self, path: str) -> bytes:
           pass

       def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
           pass


   class SomeWriter(DataWriter):
       def write(self, path: str, data: bytes) -> None:
           pass

       def write_string(self, path: str, data: str) -> None:
           pass


Readers may be curious about the difference between :doc:`io` and this section. The two sections look very similar at first glance. :doc:`io` provides fundamental functions, while this section works more at the application level. Users can build their own classes to meet their own application needs, which may share the same IO functions. That is why we have :doc:`io`. So is ``S3DataWriter``. As we all know, PDFs fall into two categories, :ref:`digital_method_section` or :ref:`ocr_method_section`. Will get ``ImageDataset``, which is a subclass of ``Dataset``, from images and get ``PymuDocDataset`` from pdf files. The difference between ``ImageDataset`` and ``PymuDocDataset`` is that ``ImageDataset`` only supports the ``OCR`` parse method, while ``PymuDocDataset`` supports both ``OCR`` and ``TXT``

.. note::

   In fact, some PDFs may be generated from images, which means they cannot support the ``TXT`` method. Currently, this is something the user needs to ensure does not happen. It is easy to implement new classes; the only requirement is to inherit from ``IOReader`` or ``IOWriter``

.. code:: python

   class SomeReader(IOReader):
       def read(self, path: str) -> bytes:
           pass

       def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
           pass


   class SomeWriter(IOWriter):
       def write(self, path: str, data: bytes) -> None:
           pass

Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/io` for more details If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Check if NVIDIA Drivers Are Installed

.. code:: sh

   nvidia-smi

If you see information similar to the following, it means that the NVIDIA drivers are already installed, and you can skip Step 2.

Notice: ``CUDA Version`` should be >= 12.1. If the displayed version number is less than 12.1, please upgrade the driver.

.. code:: text

   +---------------------------------------------------------------------------------------+
   | NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
   |-----------------------------------------+----------------------+----------------------+
   | GPU  Name                 TCC/WDDM      | Bus-Id        Disp.A | Volatile Uncorr. ECC |
   | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
   |                                         |                      |               MIG M. |
   |=========================================+======================+======================|
   |   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
   |  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
   |                                         |                      |                  N/A |
   +-----------------------------------------+----------------------+----------------------+

2. Install the Driver

If no driver is installed, use the following command:

.. code:: sh

   sudo apt-get update
   sudo apt-get install nvidia-driver-545

Install the proprietary driver and restart your computer after installation.

.. code:: sh

   reboot

3. Install Anaconda

If Anaconda is already installed, skip this step.

.. code:: sh

   wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
   bash Anaconda3-2024.06-1-Linux-x86_64.sh

In the final step, enter ``yes``, close the terminal, and reopen it.

4. Create an Environment Using Conda

Specify Python version 3.10.

.. code:: sh

   conda create -n MinerU python=3.10
   conda activate MinerU

5. Install Applications

.. code:: sh

   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com

❗ After installation, make sure to check the version of ``magic-pdf`` using the following command:

.. code:: sh

   magic-pdf --version

If the version number is less than 0.7.0, please report the issue.

6. Download Models

Refer to detailed instructions on :doc:`download_model_weight_files`

7. Understand the Location of the Configuration File

After completing the `6. Download Models <#6-download-models>`__ step, the script will automatically generate a ``magic-pdf.json`` file in the user directory and configure the default model path. You can find the ``magic-pdf.json`` file in your user directory.

The user directory for Linux is "/home/username".

8. First Run

Download a sample file from the repository and test it.

.. code:: sh

   wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
   magic-pdf -p small_ocr.pdf -o ./output

9. Test CUDA Acceleration

If your graphics card has at least **8GB** of VRAM, follow these steps to test CUDA acceleration:

❗ Due to the extremely limited nature of 8GB VRAM for running this application, you need to close all other programs using VRAM to ensure that 8GB of VRAM is available when running this application.

1. Modify the value of ``"device-mode"`` in the ``magic-pdf.json`` configuration file located in your home directory.

   .. code:: json

      {
        "device-mode": "cuda"
      }

2. Test CUDA acceleration with the following command:

   .. code:: sh

      magic-pdf -p small_ocr.pdf -o ./output

10. Enable CUDA Acceleration for OCR

1. Download ``paddlepaddle-gpu``. Installation will automatically enable OCR acceleration.

   .. code:: sh

      python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/

2. Test OCR acceleration with the following command:

   .. code:: sh

      magic-pdf -p small_ocr.pdf Install Applications

::

   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com

5. Download Models

Refer to detailed instructions on :doc:`download_model_weight_files`

6. Understand the Location of the Configuration File

After completing the `5. Download Models <#5-download-models>`__ step, the script will automatically generate a ``magic-pdf.json`` file in the user directory and configure the default model path. You can find the ``magic-pdf.json`` file in your user directory.

The user directory for Windows is "C:/Users/username".

7. First Run

Download a sample file from the repository and test it.

.. code:: powershell

   wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
   magic-pdf -p small_ocr.pdf -o ./output

8. Test CUDA Acceleration

If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.

❗ Due to the extremely limited nature of 8GB VRAM for running this application, you need to close all Installing higher + versions without specifying them will cause the program to fail. + +2. **Modify the value of ``"device-mode"``** in the ``magic-pdf.json`` + configuration file located in your user directory. + + .. code:: json + + { + "device-mode": "cuda" + } + +3. **Run the following command to test CUDA acceleration**: + + :: + + magic-pdf -p small_ocr.pdf + +9. Enable CUDA Acceleration for OCR +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. **Download paddlepaddle-gpu**, which will automatically enable OCR + acceleration upon installation. + + :: + + pip install paddlepaddle-gpu==2.6.1 + +2. **Run the following command to test OCR acceleration**: + + :: + + magic-pdf -p small_ocr.pdf + diff --git a/next_docs/en/user_guide/install/download_model_weight_files.rst b/next_docs/en/user_guide/install/download_model_weight_files.rst new file mode 100644 index 00000000..1f71689c --- /dev/null +++ b/next_docs/en/user_guide/install/download_model_weight_files.rst @@ -0,0 +1,48 @@ + +Download Model Weight Files +============================== + +Model downloads are divided into initial downloads and updates to the +model directory. Please refer to the corresponding documentation for +instructions on how to proceed. + +Initial download of model files +------------------------------ + +1. Download the Model from Hugging Face +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use a Python Script to Download Model Files from Hugging Face + +.. code:: bash + + pip install huggingface_hub + wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py + python download_models_hf.py + +The Python script will automatically download the model files and +configure the model directory in the configuration file. + +The configuration file can be found in the user directory, with the +filename ``magic-pdf.json``. 
+ +How to update models previously downloaded +----------------------------------------- + +1. Models downloaded via Git LFS +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Due to feedback from some users that downloading model files using + git lfs was incomplete or resulted in corrupted model files, this + method is no longer recommended. + +If you previously downloaded model files via git lfs, you can navigate +to the previous download directory and use the ``git pull`` command to +update the model. + +2. Models downloaded via Hugging Face or Model Scope +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you previously downloaded models via Hugging Face or Model Scope, you +can rerun the Python script used for the initial download. This will +automatically update the model directory to the latest version. diff --git a/next_docs/en/user_guide/install/install.rst b/next_docs/en/user_guide/install/install.rst new file mode 100644 index 00000000..29a7703d --- /dev/null +++ b/next_docs/en/user_guide/install/install.rst @@ -0,0 +1,107 @@ + +Install +=============================================================== +If you encounter any installation issues, please first consult the FAQ. +If the parsing results are not as expected, refer to the Known Issues. +There are three different ways to experience MinerU + +Pre-installation Notice—Hardware and Software Environment Support +------------------------------------------------------------------ + +To ensure the stability and reliability of the project, we only optimize +and test for specific hardware and software environments during +development. This ensures that users deploying and running the project +on recommended system configurations will get the best performance with +the fewest compatibility issues. + +By focusing resources on the mainline environment, our team can more +efficiently resolve potential bugs and develop new features. 
+ +In non-mainline environments, due to the diversity of hardware and +software configurations, as well as third-party dependency compatibility +issues, we cannot guarantee 100% project availability. Therefore, for +users who wish to use this project in non-recommended environments, we +suggest carefully reading the documentation and FAQ first. Most issues +already have corresponding solutions in the FAQ. We also encourage +community feedback to help us gradually expand support. + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Follow Installation to set up a project and install MinerU first. + + +.. toctree:: + :maxdepth: 1 + + quick_start/command_line + quick_start/to_markdown + diff --git a/next_docs/en/user_guide/quick_start/command_line.rst b/next_docs/en/user_guide/quick_start/command_line.rst new file mode 100644 index 00000000..548c5855 --- /dev/null +++ b/next_docs/en/user_guide/quick_start/command_line.rst @@ -0,0 +1,59 @@ + + +Command Line +=================== + +.. code:: bash + + magic-pdf --help + Usage: magic-pdf [OPTIONS] + + Options: + -v, --version display the version and exit + -p, --path PATH local pdf filepath or directory [required] + -o, --output-dir PATH output local directory [required] + -m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr + technique to extract information from pdf. txt: + suitable for the text-based pdf only and + outperform ocr. auto: automatically choose the + best method for parsing pdf from ocr and txt. + without method specified, auto will be used by + default. + -l, --lang TEXT Input the languages in the pdf (if known) to + improve OCR accuracy. Optional. You should + input "Abbreviation" with language form url: ht + tps://paddlepaddle.github.io/PaddleOCR/en/ppocr + /blog/multi_languages.html#5-support-languages- + and-abbreviations + -d, --debug BOOLEAN Enables detailed debugging information during + the execution of the CLI commands. + -s, --start INTEGER The starting page for PDF parsing, beginning + from 0. + -e, --end INTEGER The ending page for PDF parsing, beginning from + 0. + --help Show this message and exit. + + + ## show version + magic-pdf -v + + ## command line example + magic-pdf -p {some_pdf} -o {some_output_dir} -m auto + +``{some_pdf}`` can be a single PDF file or a directory containing +multiple PDFs. The results will be saved in the ``{some_output_dir}`` +directory. The output file list is as follows: + +.. 
code:: text + + ├── some_pdf.md # markdown file + ├── images # directory for storing images + ├── some_pdf_layout.pdf # layout diagram + ├── some_pdf_middle.json # MinerU intermediate processing result + ├── some_pdf_model.json # model inference result + ├── some_pdf_origin.pdf # original PDF file + ├── some_pdf_spans.pdf # smallest granularity bbox position information diagram + └── some_pdf_content_list.json # Rich text JSON arranged in reading order + +For more information about the output files, please refer to the :doc:`../tutorial/output_file_description` + diff --git a/next_docs/en/user_guide/quick_start/extract_text.rst b/next_docs/en/user_guide/quick_start/extract_text.rst new file mode 100644 index 00000000..08de7a91 --- /dev/null +++ b/next_docs/en/user_guide/quick_start/extract_text.rst @@ -0,0 +1,10 @@ + + +Extract Content from Pdf +======================== + +.. code:: python + + from magic_pdf.data.read_api import read_local_pdfs + from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze diff --git a/next_docs/en/user_guide/quick_start/to_markdown.rst b/next_docs/en/user_guide/quick_start/to_markdown.rst new file mode 100644 index 00000000..047c3ba4 --- /dev/null +++ b/next_docs/en/user_guide/quick_start/to_markdown.rst @@ -0,0 +1,52 @@ + + +Convert To Markdown +======================== + +.. 
code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode + from magic_pdf.pipe.OCRPipe import OCRPipe + + + ## args + model_list = [] + pdf_file_name = "abc.pdf" # replace with the real pdf path + + + ## prepare env + local_image_dir, local_md_dir = "output/images", "output" + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) # create 00 + image_dir = str(os.path.basename(local_image_dir)) + + reader1 = FileBasedDataReader("") + pdf_bytes = reader1.read(pdf_file_name) # read the pdf content + + + pipe = OCRPipe(pdf_bytes, model_list, image_writer) + + pipe.pipe_classify() + pipe.pipe_analyze() + pipe.pipe_parse() + + pdf_info = pipe.pdf_mid_data["pdf_info"] + + + md_content = pipe.pipe_mk_markdown( + image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD + ) + + if isinstance(md_content, list): + md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content)) + else: + md_writer.write_string(f"{pdf_file_name}.md", md_content) + + +Check :doc:`../data/data_reader_writer` for more [reader | writer] examples diff --git a/next_docs/en/user_guide/tutorial.rst b/next_docs/en/user_guide/tutorial.rst new file mode 100644 index 00000000..20764701 --- /dev/null +++ b/next_docs/en/user_guide/tutorial.rst @@ -0,0 +1,10 @@ + +Tutorial +=========== + +From the beginning to the end, Show how to using mineru via a minimal project + +.. 
toctree:: + :maxdepth: 1 + + tutorial/output_file_description \ No newline at end of file diff --git a/next_docs/en/user_guide/tutorial/output_file_description.rst b/next_docs/en/user_guide/tutorial/output_file_description.rst new file mode 100644 index 00000000..8e190e8f --- /dev/null +++ b/next_docs/en/user_guide/tutorial/output_file_description.rst @@ -0,0 +1,416 @@ + +Output File Description +========================= + +After executing the ``magic-pdf`` command, in addition to outputting +files related to markdown, several other files unrelated to markdown +will also be generated. These files will be introduced one by one. + +some_pdf_layout.pdf +~~~~~~~~~~~~~~~~~~~ + +Each page layout consists of one or more boxes. The number at the top +left of each box indicates its sequence number. Additionally, in +``layout.pdf``, different content blocks are highlighted with different +background colors. + +.. figure:: ../../_static/image/layout_example.png + :alt: layout example + + layout example + +some_pdf_spans.pdf +~~~~~~~~~~~~~~~~~~ + +All spans on the page are drawn with different colored line frames +according to the span type. This file can be used for quality control, +allowing for quick identification of issues such as missing text or +unrecognized inline formulas. + +.. figure:: ../../_static/image/spans_example.png + :alt: spans example + + spans example + +some_pdf_model.json +~~~~~~~~~~~~~~~~~~~ + +Structure Definition +^^^^^^^^^^^^^^^^^^^^ + +.. 
code:: python + + from pydantic import BaseModel, Field + from enum import IntEnum + + class CategoryType(IntEnum): + title = 0 # Title + plain_text = 1 # Text + abandon = 2 # Includes headers, footers, page numbers, and page annotations + figure = 3 # Image + figure_caption = 4 # Image description + table = 5 # Table + table_caption = 6 # Table description + table_footnote = 7 # Table footnote + isolate_formula = 8 # Block formula + formula_caption = 9 # Formula label + + embedding = 13 # Inline formula + isolated = 14 # Block formula + text = 15 # OCR recognition result + + + class PageInfo(BaseModel): + page_no: int = Field(description="Page number, the first page is 0", ge=0) + height: int = Field(description="Page height", gt=0) + width: int = Field(description="Page width", ge=0) + + class ObjectInferenceResult(BaseModel): + category_id: CategoryType = Field(description="Category", ge=0) + poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively") + score: float = Field(description="Confidence of the inference result") + latex: str | None = Field(description="LaTeX parsing result", default=None) + html: str | None = Field(description="HTML parsing result", default=None) + + class PageInferenceResults(BaseModel): + layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0) + page_info: PageInfo = Field(description="Page metadata") + + + # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU + inference_result: list[PageInferenceResults] = [] + +The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3], +representing the coordinates of the top-left, top-right, bottom-right, +and bottom-left points respectively. |Poly Coordinate Diagram| + +example +^^^^^^^ + +.. 
code:: json + + [ + { + "layout_dets": [ + { + "category_id": 2, + "poly": [ + 99.1906967163086, + 100.3119125366211, + 730.3707885742188, + 100.3119125366211, + 730.3707885742188, + 245.81326293945312, + 99.1906967163086, + 245.81326293945312 + ], + "score": 0.9999997615814209 + } + ], + "page_info": { + "page_no": 0, + "height": 2339, + "width": 1654 + } + }, + { + "layout_dets": [ + { + "category_id": 5, + "poly": [ + 99.13092803955078, + 2210.680419921875, + 497.3183898925781, + 2210.680419921875, + 497.3183898925781, + 2264.78076171875, + 99.13092803955078, + 2264.78076171875 + ], + "score": 0.9999997019767761 + } + ], + "page_info": { + "page_no": 1, + "height": 2339, + "width": 1654 + } + } + ] + +some_pdf_middle.json +~~~~~~~~~~~~~~~~~~~~ + ++-------+--------------------------------------------------------------+ +| Field | Description | +| Name | | ++=======+==============================================================+ +| pdf | list, each element is a dict representing the parsing result | +| _info | of each PDF page, see the table below for details | ++-------+--------------------------------------------------------------+ +| \_ | ocr \| txt, used to indicate the mode used in this | +| parse | intermediate parsing state | +| _type | | ++-------+--------------------------------------------------------------+ +| \_ve | string, indicates the version of magic-pdf used in this | +| rsion | parsing | +| _name | | ++-------+--------------------------------------------------------------+ + +**pdf_info** + +Field structure description + ++---------+------------------------------------------------------------+ +| Field | Description | +| Name | | ++=========+============================================================+ +| preproc | Intermediate result after PDF preprocessing, not yet | +| _blocks | segmented | ++---------+------------------------------------------------------------+ +| layout | Layout segmentation results, containing layout direction | +| _bboxes 
| (vertical, horizontal), and bbox, sorted by reading order | ++---------+------------------------------------------------------------+ +| p | Page number, starting from 0 | +| age_idx | | ++---------+------------------------------------------------------------+ +| pa | Page width and height | +| ge_size | | ++---------+------------------------------------------------------------+ +| \_layo | Layout tree structure | +| ut_tree | | ++---------+------------------------------------------------------------+ +| images | list, each element is a dict representing an img_block | ++---------+------------------------------------------------------------+ +| tables | list, each element is a dict representing a table_block | ++---------+------------------------------------------------------------+ +| inter | list, each element is a dict representing an | +| line_eq | interline_equation_block | +| uations | | ++---------+------------------------------------------------------------+ +| di | List, block information returned by the model that needs | +| scarded | to be dropped | +| _blocks | | ++---------+------------------------------------------------------------+ +| para | Result after segmenting preproc_blocks | +| _blocks | | ++---------+------------------------------------------------------------+ + +In the above table, ``para_blocks`` is an array of dicts, each dict +representing a block structure. A block can support up to one level of +nesting. 
+ +**block** + +The outer block is referred to as a first-level block, and the fields in +the first-level block include: + ++---------+-------------------------------------------------------------+ +| Field | Description | +| Name | | ++=========+=============================================================+ +| type | Block type (table|image) | ++---------+-------------------------------------------------------------+ +| bbox | Block bounding box coordinates | ++---------+-------------------------------------------------------------+ +| blocks | list, each element is a dict representing a second-level | +| | block | ++---------+-------------------------------------------------------------+ + +There are only two types of first-level blocks: “table” and “image”. All +other blocks are second-level blocks. + +The fields in a second-level block include: + ++-----+----------------------------------------------------------------+ +| Fi | Description | +| eld | | +| N | | +| ame | | ++=====+================================================================+ +| t | Block type | +| ype | | ++-----+----------------------------------------------------------------+ +| b | Block bounding box coordinates | +| box | | ++-----+----------------------------------------------------------------+ +| li | list, each element is a dict representing a line, used to | +| nes | describe the composition of a line of information | ++-----+----------------------------------------------------------------+ + +Detailed explanation of second-level block types + +================== ====================== +type Description +================== ====================== +image_body Main body of the image +image_caption Image description text +table_body Main body of the table +table_caption Table description text +table_footnote Table footnote +text Text block +title Title block +interline_equation Block formula +================== ====================== + +**line** + +The field format of a line is as 
follows: + ++-----+----------------------------------------------------------------+ +| Fi | Description | +| eld | | +| N | | +| ame | | ++=====+================================================================+ +| b | Bounding box coordinates of the line | +| box | | ++-----+----------------------------------------------------------------+ +| sp | list, each element is a dict representing a span, used to | +| ans | describe the composition of the smallest unit | ++-----+----------------------------------------------------------------+ + +**span** + ++----------+-----------------------------------------------------------+ +| Field | Description | +| Name | | ++==========+===========================================================+ +| bbox | Bounding box coordinates of the span | ++----------+-----------------------------------------------------------+ +| type | Type of the span | ++----------+-----------------------------------------------------------+ +| content | Text spans use content; image/table spans use img_path to | +| \| | store the actual text or screenshot path information | +| img_path | | ++----------+-----------------------------------------------------------+ + +The types of spans are as follows: + +================== ============== +type Description +================== ============== +image Image +table Table +text Text +inline_equation Inline formula +interline_equation Block formula +================== ============== + +**Summary** + +A span is the smallest storage unit for all elements. + +The elements stored within para_blocks are block information. + +The block structure is as follows: + +First-level block (if any) -> Second-level block -> Line -> Span + +.. _example-1: + +example +^^^^^^^ + +.. 
code:: json + + { + "pdf_info": [ + { + "preproc_blocks": [ + { + "type": "text", + "bbox": [ + 52, + 61.956024169921875, + 294, + 82.99800872802734 + ], + "lines": [ + { + "bbox": [ + 52, + 61.956024169921875, + 294, + 72.0000228881836 + ], + "spans": [ + { + "bbox": [ + 54.0, + 61.956024169921875, + 296.2261657714844, + 72.0000228881836 + ], + "content": "dependent on the service headway and the reliability of the departure ", + "type": "text", + "score": 1.0 + } + ] + } + ] + } + ], + "layout_bboxes": [ + { + "layout_bbox": [ + 52, + 61, + 294, + 731 + ], + "layout_label": "V", + "sub_layout": [] + } + ], + "page_idx": 0, + "page_size": [ + 612.0, + 792.0 + ], + "_layout_tree": [], + "images": [], + "tables": [], + "interline_equations": [], + "discarded_blocks": [], + "para_blocks": [ + { + "type": "text", + "bbox": [ + 52, + 61.956024169921875, + 294, + 82.99800872802734 + ], + "lines": [ + { + "bbox": [ + 52, + 61.956024169921875, + 294, + 72.0000228881836 + ], + "spans": [ + { + "bbox": [ + 54.0, + 61.956024169921875, + 296.2261657714844, + 72.0000228881836 + ], + "content": "dependent on the service headway and the reliability of the departure ", + "type": "text", + "score": 1.0 + } + ] + } + ] + } + ] + } + ], + "_parse_type": "txt", + "_version_name": "0.6.1" + } + +.. 
|Poly Coordinate Diagram| image:: ../../_static/image/poly.png diff --git a/next_docs/requirements.txt b/next_docs/requirements.txt index ddb5027a..1df5f63d 100644 --- a/next_docs/requirements.txt +++ b/next_docs/requirements.txt @@ -5,7 +5,8 @@ Pillow==8.4.0 pydantic>=2.7.2,<2.8.0 PyMuPDF>=1.24.9 sphinx -sphinx-argparse -sphinx-book-theme -sphinx-copybutton -sphinx_rtd_theme +sphinx-argparse>=0.5.2 +sphinx-book-theme>=1.1.3 +sphinx-copybutton>=0.5.2 +sphinx_rtd_theme>=3.0.1 +autodoc_pydantic>=2.2.0 \ No newline at end of file diff --git a/next_docs/zh_cn/.readthedocs.yaml b/next_docs/zh_cn/.readthedocs.yaml index 1f93a4d7..5df2ecff 100644 --- a/next_docs/zh_cn/.readthedocs.yaml +++ b/next_docs/zh_cn/.readthedocs.yaml @@ -10,7 +10,7 @@ formats: python: install: - - requirements: docs/requirements.txt + - requirements: next_docs/requirements.txt sphinx: - configuration: docs/zh_cn/conf.py + configuration: next_docs/zh_cn/conf.py diff --git a/projects/web_demo/README.md b/projects/web_demo/README.md index 562fe8ca..cd4b11de 100644 --- a/projects/web_demo/README.md +++ b/projects/web_demo/README.md @@ -56,5 +56,5 @@ python3 app.py or python app.py ps:API documentation ``` -Open the mineru-web API mineru-web接口文档.html in the browser +https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7 ``` diff --git a/projects/web_demo/README_zh-CN.md b/projects/web_demo/README_zh-CN.md index 3b814531..d9ee597b 100644 --- a/projects/web_demo/README_zh-CN.md +++ b/projects/web_demo/README_zh-CN.md @@ -55,5 +55,5 @@ python3 app.py 或者 python app.py ps:接口文档 ``` -在浏览器打开 mineru-web接口文档.html +https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7 ``` diff - - - - - diff --git a/scripts/download_models.py b/scripts/download_models.py new file mode 100644 index 00000000..ed1ee5c3 --- /dev/null +++ b/scripts/download_models.py @@ -0,0 +1,59 @@ +import json +import os + +import requests +from modelscope import snapshot_download + + +def download_json(url): + # 
下载JSON文件 + response = requests.get(url) + response.raise_for_status() # Check that the request succeeded + return response.json() + + +def download_and_modify_json(url, local_filename, modifications): # Load or download the JSON config, overwrite the keys given in modifications, and write the result to local_filename + if os.path.exists(local_filename): + data = json.load(open(local_filename)) # NOTE(review): this handle is never closed explicitly and no encoding is passed, unlike the utf-8 write below + config_version = data.get('config_version', '0.0.0') + if config_version < '1.0.0': # NOTE(review): str comparison, so the ordering is lexicographic rather than numeric + data = download_json(url) + else: + data = download_json(url) + + # Apply the modifications + for key, value in modifications.items(): + data[key] = value + + # Save the modified content + with open(local_filename, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + +if __name__ == '__main__': + mineru_patterns = [ + "models/Layout/LayoutLMv3/*", + "models/Layout/YOLO/*", + "models/MFD/YOLO/*", + "models/MFR/unimernet_small/*", + "models/TabRec/TableMaster/*", + "models/TabRec/StructEqTable/*", + ] + model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) + layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader') + model_dir = model_dir + '/models' # every allow_patterns entry above starts with models/ + print(f'model_dir is: {model_dir}') + print(f'layoutreader_model_dir is: {layoutreader_model_dir}') + + json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json' + config_file_name = 'magic-pdf.json' + home_dir = os.path.expanduser('~') + config_file = os.path.join(home_dir, config_file_name) + + json_mods = { + 'models-dir': model_dir, + 'layoutreader-model-dir': layoutreader_model_dir, + } + + download_and_modify_json(json_url, config_file, json_mods) + print(f'The configuration file has been configured successfully, the path is: {config_file}') diff --git a/scripts/download_models_hf.py b/scripts/download_models_hf.py new file mode 100644 index 00000000..5e6b8dce --- /dev/null +++ b/scripts/download_models_hf.py @@ -0,0 +1,66 @@ +import json +import os + +import requests +from huggingface_hub import snapshot_download + + +def download_json(url): + # Download the JSON file + response = requests.get(url) + 
response.raise_for_status() # Check that the request succeeded + return response.json() + + +def download_and_modify_json(url, local_filename, modifications): # Load or download the JSON config, overwrite the keys given in modifications, and write the result to local_filename + if os.path.exists(local_filename): + data = json.load(open(local_filename)) # NOTE(review): this handle is never closed explicitly and no encoding is passed, unlike the utf-8 write below + config_version = data.get('config_version', '0.0.0') + if config_version < '1.0.0': # NOTE(review): str comparison, so the ordering is lexicographic rather than numeric + data = download_json(url) + else: + data = download_json(url) + + # Apply the modifications + for key, value in modifications.items(): + data[key] = value + + # Save the modified content + with open(local_filename, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + +if __name__ == '__main__': + + mineru_patterns = [ + "models/Layout/LayoutLMv3/*", + "models/Layout/YOLO/*", + "models/MFD/YOLO/*", + "models/MFR/unimernet_small/*", + "models/TabRec/TableMaster/*", + "models/TabRec/StructEqTable/*", + ] + model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) + + layoutreader_pattern = [ + "*.json", + "*.safetensors", + ] + layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern) + + model_dir = model_dir + '/models' # every mineru_patterns entry above starts with models/ + print(f'model_dir is: {model_dir}') + print(f'layoutreader_model_dir is: {layoutreader_model_dir}') + + json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json' + config_file_name = 'magic-pdf.json' + home_dir = os.path.expanduser('~') + config_file = os.path.join(home_dir, config_file_name) + + json_mods = { + 'models-dir': model_dir, + 'layoutreader-model-dir': layoutreader_model_dir, + } + + download_and_modify_json(json_url, config_file, json_mods) + print(f'The configuration file has been configured successfully, the path is: {config_file}') diff --git a/setup.py b/setup.py index 0a7e8db3..513e349b 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,9 @@ def parse_requirements(filename): "paddleocr==2.7.3", # 2.8.0及2.8.1版本与detectron2有冲突,需锁定2.7.3 "paddlepaddle==3.0.0b1;platform_system=='Linux'", # 解决linux的段异常问题 
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'", # windows版本3.0.0b1效率下降,需锁定2.6.1 - 