Skip to content

Commit

Permalink
Fix classify: there is no more pdf_bytes in UNIPipe
Browse files — browse the repository at this point in the history
Signed-off-by: Mingde (Matthew) Zeng <[email protected]>
  • Loading branch information
Mingde (Matthew) Zeng committed Dec 30, 2024
1 parent c4f252d commit 9ae7635
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
4 changes: 3 additions & 1 deletion magic_pdf/pipe/AbsPipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@ def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, m
return md_content

@staticmethod
def classify(pdf_bytes: bytes) -> str:
def classify(self) -> str:
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
pdf_bytes = self.dataset.data_bits()

pdf_meta = pdf_meta_scan(pdf_bytes)
if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
Expand Down
16 changes: 7 additions & 9 deletions magic_pdf/pipe/UNIPipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def __init__(
formula_enable=None,
table_enable=None,
):
self.pdf_type = jso_useful_key['_pdf_type']
super().__init__(
dataset,
jso_useful_key['model_list'],
Expand All @@ -39,13 +38,14 @@ def __init__(
formula_enable,
table_enable,
)
self.pdf_type = jso_useful_key['_pdf_type']
if len(self.model_list) == 0:
self.input_model_is_empty = True
else:
self.input_model_is_empty = False

def pipe_classify(self):
self.pdf_type = AbsPipe.classify(self.pdf_bytes)
self.pdf_type = AbsPipe.classify(self)

def pipe_analyze(self):
if self.pdf_type == self.PIP_TXT:
Expand Down Expand Up @@ -115,8 +115,9 @@ def pipe_mk_markdown(


if __name__ == '__main__':
# 测试
# Testing
from magic_pdf.data.data_reader_writer import DataReader
from magic_pdf.data.dataset import PymuDocDataset # Import the concrete dataset class

drw = DataReader(r'D:/project/20231108code-clean')

Expand All @@ -129,14 +130,11 @@ def pipe_mk_markdown(
img_bucket_path = 'imgs'
img_writer = DataWriter(join_path(write_path, img_bucket_path))

# pdf_type = UNIPipe.classify(pdf_bytes)
# jso_useful_key = {
# "_pdf_type": pdf_type,
# "model_list": model_list
# }
# Create dataset instance instead of using raw bytes
dataset = PymuDocDataset(pdf_bytes)

jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
pipe = UNIPipe(dataset, jso_useful_key, img_writer)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(img_bucket_path)
Expand Down

0 comments on commit 9ae7635

Please sign in to comment.