From 0aa457787417e640a29f617458d0bd8bb2057438 Mon Sep 17 00:00:00 2001 From: yyy <102640628+dt-yy@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:41:45 +0800 Subject: [PATCH] feat: add test case (#645) * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case * feat: add table case --------- Co-authored-by: quyuan --- .github/workflows/cli.yml | 21 ++-- .github/workflows/daily.yml | 55 +++++++++ .github/workflows/huigui.yml | 61 +++++++++ .github/workflows/update_base.yml | 22 ---- .gitignore | 3 +- requirements-qa.txt | 3 +- tests/clean_coverage.py | 3 +- tests/retry_env.sh | 11 +- tests/test_cli/conf/conf.py | 2 +- tests/test_cli/conftest.py | 17 +++ tests/test_cli/lib/common.py | 42 ++++++- ...4039ec2c054.pdf => test_rearch_report.pdf} | Bin tests/test_cli/test_cli_sdk.py | 116 ++++++++++++++---- tests/test_cli/test_magic-pdf-dev_cli.py | 0 tests/test_cli/test_performence.py | 36 ------ tests/test_cli/test_table.py | 54 -------- tests/unittest/test_table/test_tablemaster.py | 2 +- 17 files changed, 288 insertions(+), 160 deletions(-) create mode 100644 .github/workflows/daily.yml create mode 100644 .github/workflows/huigui.yml delete mode 100644 .github/workflows/update_base.yml create mode 100644 tests/test_cli/conftest.py rename tests/test_cli/pdf_dev/pdf/{research_report_1f978cd81fb7260c8f7644039ec2c054.pdf => test_rearch_report.pdf} (100%) delete mode 100644 tests/test_cli/test_magic-pdf-dev_cli.py delete mode 100644 tests/test_cli/test_performence.py delete mode 100644 tests/test_cli/test_table.py diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index ee22ef03..92668fd7 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -10,7 +10,6 @@ on: paths-ignore: - "cmds/**" - "**.md" - - "**.yml" pull_request: branches: - "master" @@ -18,12 +17,11 @@ on: paths-ignore: - "cmds/**" - "**.md" - - "**.yml" workflow_dispatch: jobs: cli-test: runs-on: pdf - timeout-minutes: 120 + timeout-minutes: 240 strategy: fail-fast: true @@ -33,17 +31,16 @@ jobs: with: fetch-depth: 2 - - name: install + - name: install&test run: | - echo $GITHUB_WORKSPACE && sh tests/retry_env.sh - - name: unit test - run: | - cd $GITHUB_WORKSPACE && python tests/clean_coverage.py - cd $GITHUB_WORKSPACE && export PYTHONPATH=. && coverage run -m pytest tests/unittest --cov=magic_pdf/ --cov-report term-missing --cov-report html + source activate mineru + conda env list + pip show coverage + # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh + cd $GITHUB_WORKSPACE && python tests/clean_coverage.py + cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing cd $GITHUB_WORKSPACE && python tests/get_coverage.py - - name: cli test - run: | - source ~/.bashrc && cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py + cd $GITHUB_WORKSPACE && pytest -m P0 -s -v tests/test_cli/test_cli_sdk.py notify_to_feishu: if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }} diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml new file mode 100644 index 00000000..b6fc09f3 --- /dev/null +++ b/.github/workflows/daily.yml @@ -0,0 +1,55 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: mineru +on: + schedule: + - cron: '0 22 * * *' # 每天晚上 10 点执行 +jobs: + cli-test: + runs-on: pdf + timeout-minutes: 240 + strategy: + fail-fast: true + + steps: + - name: PDF cli + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: install&test + run: | + source activate mineru + conda env list + pip show coverage + # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh + cd $GITHUB_WORKSPACE && python tests/clean_coverage.py + cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing + cd $GITHUB_WORKSPACE && python tests/get_coverage.py + cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py + + notify_to_feishu: + if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }} + needs: cli-test + runs-on: pdf + steps: + - name: get_actor + run: | + metion_list="dt-yy" + echo $GITHUB_ACTOR + if [[ $GITHUB_ACTOR == "drunkpig" ]]; then + metion_list="xuchao" + elif [[ $GITHUB_ACTOR == "myhloli" ]]; then + metion_list="zhaoxiaomeng" + elif [[ $GITHUB_ACTOR == "icecraft" ]]; then + metion_list="xurui1" + fi + echo $metion_list + echo "METIONS=$metion_list" >> "$GITHUB_ENV" + echo ${{ env.METIONS }} + + - name: notify + run: | + echo ${{ secrets.USER_ID }} + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }} diff --git a/.github/workflows/huigui.yml b/.github/workflows/huigui.yml new file mode 100644 index 00000000..4b4a7905 --- /dev/null +++ b/.github/workflows/huigui.yml @@ -0,0 +1,61 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: mineru +on: + push: + branches: + - "master" + - "dev" + paths-ignore: + - "cmds/**" + - "**.md" + workflow_dispatch: +jobs: + cli-test: + runs-on: pdf + timeout-minutes: 240 + strategy: + fail-fast: true + + steps: + - name: PDF cli + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: install&test + run: | + source activate mineru + conda env list + pip show coverage + # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh + cd $GITHUB_WORKSPACE && python tests/clean_coverage.py + cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing + cd $GITHUB_WORKSPACE && python tests/get_coverage.py + cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py + + notify_to_feishu: + if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }} + needs: cli-test + runs-on: pdf + steps: + - name: get_actor + run: | + metion_list="dt-yy" + echo $GITHUB_ACTOR + if [[ $GITHUB_ACTOR == "drunkpig" ]]; then + metion_list="xuchao" + elif [[ $GITHUB_ACTOR == "myhloli" ]]; then + metion_list="zhaoxiaomeng" + elif [[ $GITHUB_ACTOR == "icecraft" ]]; then + metion_list="xurui1" + fi + echo $metion_list + echo "METIONS=$metion_list" >> "$GITHUB_ENV" + echo ${{ env.METIONS }} + + - name: notify + run: | + echo ${{ secrets.USER_ID }} + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }} diff --git a/.github/workflows/update_base.yml b/.github/workflows/update_base.yml deleted file mode 100644 index bce75a6c..00000000 --- a/.github/workflows/update_base.yml +++ /dev/null @@ -1,22 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: update-base -on: - push: - tags: - - '*released' - workflow_dispatch: -jobs: - pdf-test: - runs-on: pdf - timeout-minutes: 40 - - - steps: - - name: update-base - uses: actions/checkout@v3 - - name: start-update - run: | - echo "start test" - diff --git a/.gitignore b/.gitignore index fda7b648..1c544f23 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.tar *.tar.gz +*.zip venv*/ envs/ slurm_logs/ @@ -31,7 +32,7 @@ tmp .vscode .vscode/ ocr_demo - +.coveragerc /app/common/__init__.py /magic_pdf/config/__init__.py source.dev.env diff --git a/requirements-qa.txt b/requirements-qa.txt index 56b2dbea..f42f1d38 100644 --- a/requirements-qa.txt +++ b/requirements-qa.txt @@ -16,4 +16,5 @@ pypandoc pyopenssl==24.0.0 struct-eqtable==0.1.0 pytest-cov -beautifulsoup4 \ No newline at end of file +beautifulsoup4 +coverage \ No newline at end of file diff --git a/tests/clean_coverage.py b/tests/clean_coverage.py index 6dc543bd..80c261db 100644 --- a/tests/clean_coverage.py +++ b/tests/clean_coverage.py @@ -21,4 +21,5 @@ def delete_file(path): print(f"Error deleting directory '{path}': {e}") if __name__ == "__main__": - delete_file("htmlcov") \ No newline at end of file + delete_file("htmlcov/") + #delete_file(".coverage") diff --git a/tests/retry_env.sh b/tests/retry_env.sh index 02d8ccfe..fbe1f2d6 100644 --- a/tests/retry_env.sh +++ b/tests/retry_env.sh @@ -1,16 +1,13 @@ #!/bin/bash -# 定义最大重试次数 max_retries=5 retry_count=0 while true; do # prepare env - source activate MinerU - pip install -r requirements-qa.txt - pip uninstall magic-pdf - pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com - pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ + #python -m pip install -r requirements-qa.txt + python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple + python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ exit_code=$? if [ $exit_code -eq 0 ]; then echo "test.sh 成功执行!" @@ -22,6 +19,6 @@ while true; do exit 1 fi echo "test.sh 执行失败 (退出码: $exit_code)。尝试第 $retry_count 次重试..." - sleep 5 # 等待 5 秒后重试 + sleep 5 fi done diff --git a/tests/test_cli/conf/conf.py b/tests/test_cli/conf/conf.py index 44065b30..6ee4da72 100644 --- a/tests/test_cli/conf/conf.py +++ b/tests/test_cli/conf/conf.py @@ -4,5 +4,5 @@ "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev", "pdf_res_path": "/tmp/magic-pdf", "jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl", -"s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test.pdf" +"s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test_rearch_report.pdf" } \ No newline at end of file diff --git a/tests/test_cli/conftest.py b/tests/test_cli/conftest.py new file mode 100644 index 00000000..f7df67f6 --- /dev/null +++ b/tests/test_cli/conftest.py @@ -0,0 +1,17 @@ +import pytest +import torch + +def clear_gpu_memory(): + ''' + clear GPU memory + ''' + torch.cuda.empty_cache() + print("GPU memory cleared.") + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_teardown(item, nextitem): + ''' + clear GPU memory after each test + ''' + yield + clear_gpu_memory() \ No newline at end of file diff --git a/tests/test_cli/lib/common.py b/tests/test_cli/lib/common.py index a676aeac..6da6c94c 100644 --- a/tests/test_cli/lib/common.py +++ b/tests/test_cli/lib/common.py @@ -1,13 +1,20 @@ """common definitions.""" import os import shutil - - +import re +import json def check_shell(cmd): """shell successful.""" res = os.system(cmd) assert res == 0 +def update_config_file(file_path, key, value): + """update config file.""" + with open(file_path, 'r', encoding="utf-8") as f: + config = json.loads(f.read()) + config[key] = value + with open(file_path, 'w', encoding="utf-8") as f: + f.write(json.dumps(config)) def cli_count_folders_and_check_contents(file_path): """" count cli files.""" @@ -40,4 +47,33 @@ def delete_file(path): shutil.rmtree(path) print(f"Directory '{path}' and its contents deleted.") except TypeError as e: - print(f"Error deleting directory '{path}': {e}") \ No newline at end of file + print(f"Error deleting directory '{path}': {e}") + +def check_latex_table_exists(file_path): + """check latex table exists.""" + pattern = r'\\begin\{tabular\}.*?\\end\{tabular\}' + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + matches = re.findall(pattern, content, re.DOTALL) + return len(matches) > 0 + +def check_html_table_exists(file_path): + """check html table exists.""" + pattern = r'.*?' + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + matches = re.findall(pattern, content, re.DOTALL) + return len(matches) > 0 + +def check_close_tables(file_path): + """delete no tables.""" + latex_pattern = r'\\begin\{tabular\}.*?\\end\{tabular\}' + html_pattern = r'.*?' + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + latex_matches = re.findall(latex_pattern, content, re.DOTALL) + html_matches = re.findall(html_pattern, content, re.DOTALL) + if len(latex_matches) == 0 and len(html_matches) == 0: + return True + else: + return False \ No newline at end of file diff --git a/tests/test_cli/pdf_dev/pdf/research_report_1f978cd81fb7260c8f7644039ec2c054.pdf b/tests/test_cli/pdf_dev/pdf/test_rearch_report.pdf similarity index 100% rename from tests/test_cli/pdf_dev/pdf/research_report_1f978cd81fb7260c8f7644039ec2c054.pdf rename to tests/test_cli/pdf_dev/pdf/test_rearch_report.pdf diff --git a/tests/test_cli/test_cli_sdk.py b/tests/test_cli/test_cli_sdk.py index 2a73e5f1..d4d6fa82 100644 --- a/tests/test_cli/test_cli_sdk.py +++ b/tests/test_cli/test_cli_sdk.py @@ -1,11 +1,10 @@ """test cli and sdk.""" import logging import os - import pytest from conf import conf from lib import common - +import time import magic_pdf.model as model_config from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter @@ -57,6 +56,7 @@ def test_pdf_auto_sdk(self): @pytest.mark.P0 def test_pdf_ocr_sdk(self): """pdf sdk ocr test.""" + time.sleep(2) demo_names = list() pdf_path = os.path.join(pdf_dev_path, 'pdf') for pdf_file in os.listdir(pdf_path): @@ -88,10 +88,11 @@ def test_pdf_ocr_sdk(self): with open(res_path, 'w+', encoding='utf-8') as f: f.write(md_content) common.sdk_count_folders_and_check_contents(res_path) - + @pytest.mark.P0 def test_pdf_txt_sdk(self): """pdf sdk txt test.""" + time.sleep(2) demo_names = list() pdf_path = os.path.join(pdf_dev_path, 'pdf') for pdf_file in os.listdir(pdf_path): @@ -99,7 +100,6 @@ def test_pdf_txt_sdk(self): demo_names.append(pdf_file.split('.')[0]) for demo_name in demo_names: pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf') - print(pdf_path) pdf_bytes = open(pdf_path, 'rb').read() local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images') image_dir = str(os.path.basename(local_image_dir)) @@ -123,10 +123,11 @@ def test_pdf_txt_sdk(self): with open(res_path, 'w+', encoding='utf-8') as f: f.write(md_content) common.sdk_count_folders_and_check_contents(res_path) - + @pytest.mark.P0 def test_pdf_cli_auto(self): """magic_pdf cli test auto.""" + time.sleep(2) demo_names = [] pdf_path = os.path.join(pdf_dev_path, 'pdf') for pdf_file in os.listdir(pdf_path): @@ -141,10 +142,11 @@ def test_pdf_cli_auto(self): os.system(cmd) common.cli_count_folders_and_check_contents( os.path.join(res_path, demo_name, 'auto')) - + @pytest.mark.P0 - def test_pdf_clit_txt(self): + def test_pdf_cli_txt(self): """magic_pdf cli test txt.""" + time.sleep(2) demo_names = [] pdf_path = os.path.join(pdf_dev_path, 'pdf') for pdf_file in os.listdir(pdf_path): @@ -159,10 +161,11 @@ def test_pdf_clit_txt(self): os.system(cmd) common.cli_count_folders_and_check_contents( os.path.join(res_path, demo_name, 'txt')) - + @pytest.mark.P0 - def test_pdf_clit_ocr(self): + def test_pdf_cli_ocr(self): """magic_pdf cli test ocr.""" + time.sleep(2) demo_names = [] pdf_path = os.path.join(pdf_dev_path, 'pdf') for pdf_file in os.listdir(pdf_path): @@ -177,85 +180,102 @@ def test_pdf_clit_ocr(self): os.system(cmd) common.cli_count_folders_and_check_contents( os.path.join(res_path, demo_name, 'ocr')) - + + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_local_jsonl_txt(self): """magic_pdf_dev cli local txt.""" + time.sleep(2) jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt") logging.info(cmd) os.system(cmd) - + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_local_jsonl_ocr(self): """magic_pdf_dev cli local ocr.""" + time.sleep(2) jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr') logging.info(cmd) os.system(cmd) + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_local_jsonl_auto(self): """magic_pdf_dev cli local auto.""" + time.sleep(2) jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto') logging.info(cmd) os.system(cmd) - + + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_s3_jsonl_txt(self): """magic_pdf_dev cli s3 txt.""" + time.sleep(2) jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt") logging.info(cmd) os.system(cmd) - + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_s3_jsonl_ocr(self): """magic_pdf_dev cli s3 ocr.""" + time.sleep(2) jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr') logging.info(cmd) os.system(cmd) + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_s3_jsonl_auto(self): """magic_pdf_dev cli s3 auto.""" + time.sleep(2) jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto') logging.info(cmd) os.system(cmd) - @pytest.mark.P1 def test_pdf_dev_cli_pdf_json_auto(self): """magic_pdf_dev cli pdf+json auto.""" + time.sleep(2) json_path = os.path.join(pdf_dev_path, 'test_model.json') - pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf') + pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf') cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto') logging.info(cmd) os.system(cmd) - + + @pytest.mark.skip(reason='out-of-date api') @pytest.mark.P1 def test_pdf_dev_cli_pdf_json_ocr(self): """magic_pdf_dev cli pdf+json ocr.""" + time.sleep(2) json_path = os.path.join(pdf_dev_path, 'test_model.json') - pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf') + pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf') cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto') logging.info(cmd) os.system(cmd) - - + @pytest.mark.P1 def test_s3_sdk_suto(self): - pdf_ak = os.environ.get('pdf_ak', "") + """ + test s3 sdk auto. + """ + time.sleep(2) + pdf_ak = os.getenv('pdf_ak') + print (pdf_ak) pdf_sk = os.environ.get('pdf_sk', "") pdf_bucket = os.environ.get('bucket', "") pdf_endpoint = os.environ.get('pdf_endpoint', "") s3_pdf_path = conf.conf["s3_pdf_path"] - image_dir = "s3://" + pdf_bucket + "/mineru/test/test.md" + image_dir = "s3://" + pdf_bucket + "/mineru/test/output" + print (image_dir) s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint) s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir) pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN) @@ -267,6 +287,60 @@ def test_s3_sdk_suto(self): md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") assert len(md_content) > 0 + @pytest.mark.P1 + def test_local_magic_pdf_open_st_table(self): + """magic pdf cli open st table.""" + time.sleep(2) + pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json" + print (pre_cmd) + os.system(pre_cmd) + pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") + common.delete_file(pdf_res_path) + cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) + os.system(cli_cmd) + res = common.check_latex_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")) + assert res is True + + @pytest.mark.P1 + def test_local_magic_pdf_open_html_table(self): + """magic pdf cli open html table.""" + time.sleep(2) + pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json" + os.system(pre_cmd) + pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") + common.delete_file(pdf_res_path) + cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) + os.system(cli_cmd) + res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")) + assert res is True + + @pytest.mark.P1 + def test_magic_pdf_close_html_table_cpu(self): + """magic pdf cli close html table cpu mode.""" + time.sleep(2) + pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json" + os.system(pre_cmd) + pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") + common.delete_file(pdf_res_path) + cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) + os.system(cli_cmd) + res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")) + assert res is True + + @pytest.mark.P1 + def test_local_magic_pdf_close_html_table(self): + """magic pdf cli close table.""" + time.sleep(2) + pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json" + os.system(pre_cmd) + pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") + common.delete_file(pdf_res_path) + cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) + os.system(cli_cmd) + res = common.check_close_tables(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")) + assert res is True + + if __name__ == '__main__': pytest.main() diff --git a/tests/test_cli/test_magic-pdf-dev_cli.py b/tests/test_cli/test_magic-pdf-dev_cli.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_cli/test_performence.py b/tests/test_cli/test_performence.py deleted file mode 100644 index bd462133..00000000 --- a/tests/test_cli/test_performence.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -test performance -""" -import os -import shutil -import json -from lib import calculate_score -import pytest -from conf import conf - -code_path = os.environ.get('GITHUB_WORKSPACE') -pdf_dev_path = conf.conf["pdf_dev_path"] -pdf_res_path = conf.conf["pdf_res_path"] - -class TestTable(): - """ - test table - """ - def test_perf_close_table(self): - """ - test perf when close table - """ - - - - -def get_score(): - """ - get score - """ - score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json")) - score.calculate_similarity_total("mineru", pdf_dev_path) - res = score.summary_scores() - return res - - diff --git a/tests/test_cli/test_table.py b/tests/test_cli/test_table.py deleted file mode 100644 index 8e16475b..00000000 --- a/tests/test_cli/test_table.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -test table case -""" -import os -import shutil -import json -from lib import calculate_score -import pytest -from conf import conf - -code_path = os.environ.get('GITHUB_WORKSPACE') -pdf_dev_path = conf.conf["pdf_dev_path"] -pdf_res_path = conf.conf["pdf_res_path"] - -class TestTable(): - """ - test table - """ - def test_paddle_table_master_cuda(self): - """ - select table: paddle table master,mode is cuda - """ - def test_paddle_table_master_cpu(self): - """ - select table: paddle table master, mode is cpu - """ - def test_st_table_cuda(self): - """ - select table: ST, mode is cuda - """ - - def test_st_table_cpu(self): - """ - select table: ST, mode is cpu - """ - - def test_close_table_cuda(self): - """ - close table, mode is cuda - """ - - - - -def get_score(): - """ - get score - """ - score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json")) - score.calculate_similarity_total("mineru", pdf_dev_path) - res = score.summary_scores() - return res - - diff --git a/tests/unittest/test_table/test_tablemaster.py b/tests/unittest/test_table/test_tablemaster.py index 33ab05b0..9078b8dd 100644 --- a/tests/unittest/test_table/test_tablemaster.py +++ b/tests/unittest/test_table/test_tablemaster.py @@ -7,7 +7,7 @@ def test_image2html(self): img = Image.open("tests/unittest/test_table/assets/table.jpg") # 修改table模型路径 config = {"device": "cuda", - "model_dir": "/home/quyuan/PDF-Extract-Kit/models/TabRec/TableMaster"} + "model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"} table_model = ppTableModel(config) res = table_model.img2html(img) true_value = """
MethodsRPFFPS
SegLink [26]70.086.077.08.9
PixelLink [4]73.283.077.8-
TextSnake [18]73.983.278.31.1
TextField [37]75.987.481.35.2
MSR[38]76.787.481.7-
FTSN[3]77.187.682.0-
LSE[30]81.784.282.9-
CRAFT [2]78.288.282.98.6
MCN [16]7988.83-
ATRR[35]82.185.283.6-
PAN [34]83.884.484.130.2
DB[12]79.291.584.932.0
DRRG [41]82.3088.0585.08-
Ours (SynText)80.6885.4082.9712.68
Ours (MLT-17)84.5486.6285.5712.31
\n"""