Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change content-making logic to use union_make #94

Merged
merged 1 commit into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions magic_pdf/cli/magicpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from loguru import logger
from pathlib import Path

from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
Expand Down Expand Up @@ -78,8 +79,8 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pdf_info = pipe.pdf_mid_data['pdf_info']
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
md_content = pipe.pipe_mk_markdown(image_dir)
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)

md_writer.write(
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
)
Expand All @@ -89,7 +90,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
mode=AbsReaderWriter.MODE_TXT,
)
try:
content_list = pipe.pipe_mk_uni_format(image_dir)
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
except Exception as e:
logger.exception(e)
md_writer.write(
Expand Down
26 changes: 8 additions & 18 deletions magic_pdf/pipe/AbsPipe.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod

from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para, union_make
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
Expand Down Expand Up @@ -41,14 +42,14 @@ def pipe_parse(self):
raise NotImplementedError

@abstractmethod
def pipe_mk_uni_format(self):
def pipe_mk_uni_format(self, img_parent_path, drop_mode):
"""
Stateful assembly of the unified format
"""
raise NotImplementedError

@abstractmethod
def pipe_mk_markdown(self):
def pipe_mk_markdown(self, img_parent_path, drop_mode):
"""
Stateful assembly of the markdown
"""
Expand Down Expand Up @@ -83,34 +84,23 @@ def classify(pdf_bytes: bytes) -> str:
return AbsPipe.PIP_OCR

@staticmethod
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
"""
Generate the unified-format content_list according to the PDF type
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == AbsPipe.PIP_TXT:
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
elif parse_type == AbsPipe.PIP_OCR:
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
return content_list

@staticmethod
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
"""
Generate markdown according to the PDF type
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == AbsPipe.PIP_TXT:
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
# md_content = mk_mm_markdown(content_list)
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
elif parse_type == AbsPipe.PIP_OCR:
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)
return md_content


9 changes: 5 additions & 4 deletions magic_pdf/pipe/OCRPipe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
Expand All @@ -15,10 +16,10 @@ def pipe_classify(self):
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list

def pipe_mk_markdown(self, img_parent_path: str):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return md_content
9 changes: 5 additions & 4 deletions magic_pdf/pipe/TXTPipe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
Expand All @@ -15,10 +16,10 @@ def pipe_classify(self):
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list

def pipe_mk_markdown(self, img_parent_path: str):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return md_content
10 changes: 6 additions & 4 deletions magic_pdf/pipe/UNIPipe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json

from loguru import logger

from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
Expand All @@ -25,12 +27,12 @@ def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list

def pipe_mk_markdown(self, img_parent_path: str):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return markdown_content


Expand Down
Loading