opendatalab · myhloli · Dec 2, 2024 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml
@@ -30,33 +30,17 @@ jobs:
         source activate mineru
         conda env list
         pip show coverage
-        # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
+        cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
         cd $GITHUB_WORKSPACE && python tests/clean_coverage.py      
         cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/  --cov-report html --cov-report term-missing
         cd $GITHUB_WORKSPACE && python tests/get_coverage.py
         cd $GITHUB_WORKSPACE && pytest -m P0 -s -v tests/test_cli/test_cli_sdk.py
 
   notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}}
     needs: cli-test
     runs-on: pdf
     steps:
-    - name: get_actor
-      run: |
-          metion_list="dt-yy"
-          echo $GITHUB_ACTOR
-          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
-            metion_list="xuchao"
-          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
-            metion_list="zhaoxiaomeng"
-          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
-            metion_list="xurui1"
-          fi
-          echo $metion_list
-          echo "METIONS=$metion_list" >> "$GITHUB_ENV"
-          echo ${{ env.METIONS }}
-
     - name: notify
       run: |
-        echo ${{ secrets.USER_ID }}
-        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
+        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'"$USER_ID"'"}]]}}}}'  "$WEBHOOK_URL"  # USER_ID / WEBHOOK_URL are read from the step's environment; quoted so the shell never splits or globs them
diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
diff --git a/.github/workflows/huigui.yml b/.github/workflows/huigui.yml
@@ -29,14 +29,14 @@ jobs:
         source activate mineru
         conda env list
         pip show coverage
-        # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
+        cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
         cd $GITHUB_WORKSPACE && python tests/clean_coverage.py      
         cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/  --cov-report html --cov-report term-missing
         cd $GITHUB_WORKSPACE && python tests/get_coverage.py
         cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py
 
   notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}}
     needs: cli-test
     runs-on: pdf
     steps:
@@ -57,5 +57,5 @@ jobs:
 
     - name: notify
       run: |
-        echo ${{ secrets.USER_ID }}
-        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
+        # USER_ID and WEBHOOK_URL are read from the step's environment; no env: mapping is visible in this hunk, so confirm the runner or workflow provides them.
+        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'"$USER_ID"'"}]]}}}}'  "$WEBHOOK_URL"
diff --git a/magic_pdf/model/doc_analyze_by_custom_model.py b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -143,8 +143,10 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
     if lang == "":
         lang = None
 
-    model_manager = ModelSingleton()
-    custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable)
+    # model_manager = ModelSingleton()
+    # custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable)
+
+    custom_model = custom_model_init(ocr, show_log, lang, layout_model, formula_enable, table_enable)
 
     with fitz.open("pdf", pdf_bytes) as doc:
         pdf_page_num = doc.page_count

diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py
@@ -22,7 +22,7 @@
 
 from magic_pdf.config.constants import *
 from magic_pdf.model.model_list import AtomicModel
-from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
+from magic_pdf.model.sub_modules.model_init import AtomModelSingleton, ocr_model_init
 from magic_pdf.model.sub_modules.model_utils import (
     clean_vram, crop_img, get_res_list_from_layout_res)
 from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import (
@@ -150,9 +150,14 @@ def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
                 device=self.device,
             )
         # 初始化ocr
-        self.ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name=AtomicModel.OCR,
-            ocr_show_log=show_log,
+        # self.ocr_model = atom_model_manager.get_atom_model(
+        #     atom_model_name=AtomicModel.OCR,
+        #     ocr_show_log=show_log,
+        #     det_db_box_thresh=0.3,
+        #     lang=self.lang
+        # )
+        self.ocr_model = ocr_model_init(
+            show_log=show_log,
             det_db_box_thresh=0.3,
             lang=self.lang
         )
@@ -215,6 +220,7 @@ def __call__(self, image):
 
             # OCR recognition
             new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
+
             if self.apply_ocr:
                 ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
             else:

diff --git a/magic_pdf/model/sub_modules/model_init.py b/magic_pdf/model/sub_modules/model_init.py
@@ -57,6 +57,11 @@ def doclayout_yolo_model_init(weight, device='cpu'):
     return model
 
 
+import threading
+current_thread = threading.current_thread()
+current_thread_id = current_thread.ident  # NOTE(review): evaluated once when this module is imported, so it is the importing thread's ident, not that of a later caller
+
+
 def ocr_model_init(show_log: bool = False,
                    det_db_box_thresh=0.3,
                    lang=None,
@@ -92,14 +97,24 @@ def __new__(cls, *args, **kwargs):
         return cls._instance
 
     def get_atom_model(self, atom_model_name: str, **kwargs):
+        """Return the cached model for `atom_model_name`, building it with atom_model_init(**kwargs) on first request."""
         lang = kwargs.get('lang', None)
         layout_model_name = kwargs.get('layout_model_name', None)
-        key = (atom_model_name, layout_model_name, lang)
+        table_model_name = kwargs.get('table_model_name', None)
+        # The cache key depends on the model type; OCR models are cached per language and per calling thread.
+        if atom_model_name == AtomicModel.OCR:
+            key = (atom_model_name, lang, threading.get_ident())  # caller's ident, read at call time rather than at import
+        elif atom_model_name == AtomicModel.Layout:
+            key = (atom_model_name, layout_model_name)
+        elif atom_model_name == AtomicModel.Table:
+            key = (atom_model_name, table_model_name)
+        else:
+            key = atom_model_name
+        # Only the first request for a given key constructs the model; later requests reuse the stored instance.
         if key not in self._models:
             self._models[key] = atom_model_init(model_name=atom_model_name, **kwargs)
         return self._models[key]
 
-
 def atom_model_init(model_name: str, **kwargs):
     atom_model = None
     if model_name == AtomicModel.Layout:
@@ -129,7 +144,7 @@ def atom_model_init(model_name: str, **kwargs):
         atom_model = ocr_model_init(
             kwargs.get('ocr_show_log'),
             kwargs.get('det_db_box_thresh'),
-            kwargs.get('lang')
+            kwargs.get('lang'),
         )
     elif model_name == AtomicModel.Table:
         atom_model = table_model_init(

diff --git a/magic_pdf/model/sub_modules/model_utils.py b/magic_pdf/model/sub_modules/model_utils.py
@@ -42,10 +42,16 @@ def get_res_list_from_layout_res(layout_res):
 
 
 def clean_vram(device, vram_threshold=8):
+    vram_gb = get_vram(device)  # total device memory in GB; None when CUDA is unavailable or device is 'cpu'
+    if vram_gb and vram_gb <= vram_threshold:  # clean up only on devices at or below the threshold
+        started_at = time.time()
+        clean_memory()
+        gc_time = round(time.time() - started_at, 2)
+        logger.info(f"gc time: {gc_time}")
+
+
+def get_vram(device):  # total memory of `device` in GB, or None when CUDA is unavailable or device is 'cpu'
     if torch.cuda.is_available() and device != 'cpu':
         total_memory = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)  # 将字节转换为 GB
-        if total_memory <= vram_threshold:
-            gc_start = time.time()
-            clean_memory()
-            gc_time = round(time.time() - gc_start, 2)
-            logger.info(f"gc time: {gc_time}")
+        return total_memory  # float: bytes divided by 1024 ** 3
+    return None  # no usable CUDA device, so there is nothing to report
diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py
@@ -31,7 +31,7 @@
 except ImportError:
     pass
 
-from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
+from magic_pdf.model.sub_modules.model_init import AtomModelSingleton, ocr_model_init
 from magic_pdf.para.para_split_v3 import para_split
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
@@ -152,7 +152,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
             return False
 
 
-def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
+def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, ocr_model):
 
     text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
 
@@ -231,13 +231,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     if len(empty_spans) > 0:
 
         # 初始化ocr模型
-        atom_model_manager = AtomModelSingleton()
-        ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name="ocr",
-            ocr_show_log=False,
-            det_db_box_thresh=0.3,
-            lang=lang
-        )
+        # atom_model_manager = AtomModelSingleton()
+        # ocr_model = atom_model_manager.get_atom_model(
+        #     atom_model_name='ocr',
+        #     ocr_show_log=False,
+        #     det_db_box_thresh=0.3,
+        #     lang=lang
+        # )
 
         for span in empty_spans:
             # 对span的bbox截图再ocr
@@ -613,7 +613,7 @@ def get_block_bboxes(blocks, block_type_list):
 
 
 def parse_page_core(
-    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
+    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, ocr_model
 ):
     need_drop = False
     drop_reason = []
@@ -682,7 +682,7 @@ def parse_page_core(
     if parse_mode == SupportedPdfParseMethod.TXT:
 
         """使用新版本的混合ocr方案"""
-        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
+        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, ocr_model)
 
     elif parse_mode == SupportedPdfParseMethod.OCR:
         pass
@@ -771,6 +771,13 @@ def pdf_parse_union(
     debug_mode=False,
     lang=None,
 ):
+
+    ocr_model = ocr_model_init(
+        show_log=False,
+        det_db_box_thresh=0.3,
+        lang=lang
+    )
+
     pdf_bytes_md5 = compute_md5(dataset.data_bits())
 
     """初始化空的pdf_info_dict"""
@@ -806,7 +813,7 @@ def pdf_parse_union(
         """解析pdf中的每一页"""
         if start_page_id <= page_id <= end_page_id:
             page_info = parse_page_core(
-                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
+                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, ocr_model
             )
         else:
             page_info = page.get_page_info()

diff --git a/projects/gradio_app/app.py b/projects/gradio_app/app.py
@@ -14,7 +14,9 @@
 from loguru import logger
 
 from magic_pdf.data.data_reader_writer import FileBasedDataReader
+from magic_pdf.libs.config_reader import get_device
 from magic_pdf.libs.hash_utils import compute_sha256
+from magic_pdf.model.sub_modules.model_utils import get_vram
 from magic_pdf.tools.common import do_parse, prepare_env
 
 
@@ -183,6 +185,16 @@ def to_pdf(file_path):
             return tmp_file_path
 
 
+def get_concurrency_limit(vram_threshold=7.5):
+    """Return the number of concurrent jobs: one per `vram_threshold` of VRAM reported by get_vram, at least 1."""
+    vram = get_vram(device=get_device())
+    # No numeric VRAM figure available: allow a single job.
+    if vram is None or not isinstance(vram, (int, float)):
+        return 1
+    # Floor division gives how many whole `vram_threshold` shares fit; max() keeps the result >= 1.
+    return max(1, int(vram // vram_threshold))
+
+
 if __name__ == '__main__':
     with gr.Blocks() as demo:
         gr.HTML(header)
@@ -219,7 +231,7 @@ def to_pdf(file_path):
                         md_text = gr.TextArea(lines=45, show_copy_button=True)
         file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
         change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
-                        outputs=[md, md_text, output_file, pdf_show])
+                        outputs=[md, md_text, output_file, pdf_show], concurrency_limit=get_concurrency_limit())
         clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
 
     demo.launch(server_name='0.0.0.0')
diff --git a/setup.py b/setup.py
@@ -39,7 +39,7 @@ def parse_requirements(filename):
             "full": ["unimernet==0.2.1",  # unimernet升级0.2.1
                      "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1及之后不提供windows的预编译包，避免一些没有编译环境的windows设备安装失败
                      "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # linux 和 macos 不应限制matplotlib的最高版本，以避免无法更新导致的一些bug
-                     "ultralytics",  # yolov8,公式检测
+                     "ultralytics>=8.3.43",  # yolov8,公式检测
                      "paddleocr==2.7.3",  # 2.8.0及2.8.1版本与detectron2有冲突，需锁定2.7.3
                      "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # 解决linux的段异常问题
                      "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # windows版本3.0.0b1效率下降，需锁定2.6.1

diff --git a/tests/test_cli/test_cli_sdk.py b/tests/test_cli/test_cli_sdk.py
@@ -7,8 +7,11 @@
 import time
 import magic_pdf.model as model_config
 from magic_pdf.pipe.UNIPipe import UNIPipe
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
+import os
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.pipe.OCRPipe import OCRPipe
 model_config.__use_inside_model__ = True
 pdf_res_path = conf.conf['pdf_res_path']
 code_path = conf.conf['code_path']
@@ -41,7 +44,7 @@ def test_pdf_auto_sdk(self):
             pdf_bytes = open(pdf_path, 'rb').read()
             local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
             image_dir = str(os.path.basename(local_image_dir))
-            image_writer = DiskReaderWriter(local_image_dir)
+            image_writer = FileBasedDataWriter(local_image_dir)
             model_json = list()
             jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
             pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -77,7 +80,7 @@ def test_pdf_ocr_sdk(self):
             pdf_bytes = open(pdf_path, 'rb').read()
             local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
             image_dir = str(os.path.basename(local_image_dir))
-            image_writer = DiskReaderWriter(local_image_dir)
+            image_writer = FileBasedDataWriter(local_image_dir)
             model_json = list()
             jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
             pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -112,7 +115,7 @@ def test_pdf_txt_sdk(self):
             pdf_bytes = open(pdf_path, 'rb').read()
             local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
             image_dir = str(os.path.basename(local_image_dir))
-            image_writer = DiskReaderWriter(local_image_dir)
+            image_writer = FileBasedDataWriter(local_image_dir)
             model_json = list()
             jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
             pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -284,12 +287,13 @@ def test_s3_sdk_suto(self):
         pdf_endpoint = os.environ.get('pdf_endpoint', "")
         s3_pdf_path = conf.conf["s3_pdf_path"]
         image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
-        print (image_dir)
-        s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
-        s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir)
-        pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
-        jso_useful_key = {"_pdf_type": "", "model_list": []}
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
+        prefix = "mineru/test/output"
+        reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
+        # = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
+        image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
+        pdf_bytes = reader.read(s3_pdf_path)
+        model_list = []
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer)
         pipe.pipe_classify()
         pipe.pipe_analyze()
         pipe.pipe_parse()
@@ -427,3 +431,4 @@ def test_local_magic_pdf_close_html_table(self):
 
 if __name__ == '__main__':
     pytest.main()
+