-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
2,434 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
import os | ||
import json | ||
import datetime | ||
|
||
from loguru import logger | ||
|
||
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox | ||
from magic_pdf.pipe.UNIPipe import UNIPipe | ||
from magic_pdf.pipe.OCRPipe import OCRPipe | ||
from magic_pdf.pipe.TXTPipe import TXTPipe | ||
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter | ||
|
||
def pdf_parse(
    pdf_bytes: bytes,
    parse_method: str = 'auto',
    model_json_path: str = None,
    output_dir: str = None
):
    """
    Parse an in-memory PDF and return its content list and markdown text.

    The selected pipe is given a DiskReaderWriter rooted at
    ``<output_dir>/<timestamp>/images`` (presumably where extracted images
    end up -- confirm against the pipe implementation). This function itself
    writes no .md or .json files; it only returns the generated content.

    :param pdf_bytes: raw bytes of the PDF document to parse
    :param parse_method: one of 'auto', 'ocr' or 'txt' (default 'auto');
        if the result is poor, 'ocr' may be worth trying
    :param model_json_path: path to a JSON file holding already-computed model
        output for this PDF; when empty, ``pipe_analyze()`` is run instead.
        The PDF and the model JSON must correspond to each other
    :param output_dir: base directory for the timestamped result folder;
        defaults to the directory containing this script
    :return: tuple ``(content_list, md_content)``
    :raises ValueError: if ``parse_method`` is not 'auto', 'ocr' or 'txt'

    Any other exception raised while parsing is logged and re-raised.
    """
    # Reject bad input up front with a catchable error instead of exit(1),
    # which would terminate the whole hosting process.
    if parse_method not in ("auto", "txt", "ocr"):
        raise ValueError("unknown parse method, only auto, ocr, txt allowed")

    try:
        # NOTE: the folder name has one-second resolution, so two calls made
        # within the same second resolve to the same folder.
        foldname = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        base_dir = output_dir or os.path.dirname(os.path.abspath(__file__))
        output_path = os.path.join(base_dir, foldname)
        output_image_path = os.path.join(output_path, 'images')

        # Only the base name of the image directory is handed to the
        # formatters, so image references come out as relative paths.
        image_path_parent = os.path.basename(output_image_path)

        if model_json_path:
            # Pre-computed model output for this PDF (a list).
            with open(model_json_path, "r", encoding="utf-8") as model_file:
                model_json = json.load(model_file)
        else:
            model_json = []

        image_writer = DiskReaderWriter(output_image_path)

        # Pick the pipe implementation for the requested parse method.
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        else:
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)

        pipe.pipe_classify()

        # Without supplied model data, run the built-in model analysis.
        if not model_json:
            pipe.pipe_analyze()

        pipe.pipe_parse()

        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        return content_list, md_content

    except Exception as e:
        # Log with traceback, then propagate: returning None implicitly would
        # only surface later as an unrelated unpacking error in the caller.
        logger.exception(e)
        raise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
''' | ||
Author: FutureMeng [email protected] | ||
Date: 2024-11-13 19:44:33 | ||
LastEditors: FutureMeng [email protected] | ||
LastEditTime: 2024-11-14 15:47:27 | ||
FilePath: \MinerU\scripts\fastapitest.py | ||
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE | ||
''' | ||
from fastapi import FastAPI | ||
import urllib.request | ||
from . import magic_pdf_parse_util | ||
|
||
app = FastAPI()


@app.post("/parse_pdf")
async def parse_pdf(imageUrl: str, parse_method: str = 'auto'):
    """
    Download the PDF at ``imageUrl`` and return its parsed content.

    :param imageUrl: http(s) URL of the PDF to download
    :param parse_method: 'auto', 'ocr' or 'txt', forwarded to ``pdf_parse``
    :return: dict with keys ``content_list`` and ``md_content``
    :raises HTTPException: 400 for an unsupported URL scheme, an unknown
        parse method, or a failed download; 500 when parsing yields no result
    """
    import asyncio
    import urllib.error
    from urllib.parse import urlparse

    from fastapi import HTTPException

    # urlopen() also understands file:// and ftp://, so restrict the scheme
    # to keep callers from reading files local to the server.
    # NOTE(review): http(s) requests to internal hosts are still possible;
    # consider an allow-list if this endpoint is exposed publicly.
    if urlparse(imageUrl).scheme.lower() not in ("http", "https"):
        raise HTTPException(status_code=400, detail="imageUrl must be an http(s) URL")

    if parse_method not in ("auto", "ocr", "txt"):
        raise HTTPException(
            status_code=400,
            detail="unknown parse method, only auto, ocr, txt allowed",
        )

    def _download_and_parse():
        # Blocking network + parsing work; executed off the event loop.
        with urllib.request.urlopen(imageUrl, timeout=60) as response:
            pdf_bytes = response.read()
        return magic_pdf_parse_util.pdf_parse(pdf_bytes, parse_method)

    loop = asyncio.get_running_loop()
    try:
        result = await loop.run_in_executor(None, _download_and_parse)
    except urllib.error.URLError as err:
        raise HTTPException(status_code=400, detail=f"failed to download PDF: {err}")

    if result is None:
        # pdf_parse produced no result (e.g. it logged an internal failure).
        raise HTTPException(status_code=500, detail="PDF parsing failed")

    content_list, md_content = result
    return {"content_list": content_list, "md_content": md_content}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash
# Start the uvicorn server for app.main:app on port 80.
echo "starting miner server"

# Abort early if the environment or working directory is missing, rather
# than launching uvicorn from the wrong place.
source /opt/mineru_venv/bin/activate || { echo "failed to activate /opt/mineru_venv" >&2; exit 1; }
cd /minerugw || { echo "cannot cd to /minerugw" >&2; exit 1; }

# exec replaces the shell so signals are delivered to uvicorn directly.
exec uvicorn app.main:app --host 0.0.0.0 --port 80
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
''' | ||
Author: FutureMeng [email protected] | ||
Date: 2024-11-13 19:05:01 | ||
LastEditors: FutureMeng [email protected] | ||
LastEditTime: 2024-11-13 19:06:17 | ||
FilePath: \lzmineru\api\test.py | ||
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE | ||
''' | ||
|
||
import urllib.request | ||
import os | ||
from magic_pdf.pipe.UNIPipe import UNIPipe | ||
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter | ||
|
||
def main():
    """
    Download a PDF, run it through UNIPipe and print the resulting markdown.

    The PDF URL is taken from the first command-line argument, falling back
    to the ``PDF_URL`` environment variable. It is deliberately not
    hard-coded: a pre-signed URL embeds an access key id and signature and
    stops working once it expires.
    """
    import sys

    pdf_url = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("PDF_URL")
    if not pdf_url:
        sys.exit(f"usage: {sys.argv[0]} <pdf-url>  (or set PDF_URL)")

    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    local_image_dir = os.path.join(current_script_dir, 'images')
    # Only the directory's base name is passed to the markdown step.
    image_dir = str(os.path.basename(local_image_dir))

    with urllib.request.urlopen(pdf_url) as response:
        pdf_bytes = response.read()

    image_writer = DiskReaderWriter(local_image_dir)
    jso_useful_key = {"_pdf_type": "", "model_list": []}
    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
    pipe.pipe_classify()
    pipe.pipe_analyze()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
    print(md_content)


if __name__ == "__main__":
    main()
Binary file not shown.
Oops, something went wrong.