diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py index c98da4c3..e7137dc4 100644 --- a/magic_pdf/tools/common.py +++ b/magic_pdf/tools/common.py @@ -14,6 +14,9 @@ from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +import fitz +# from io import BytesIO +# from pypdf import PdfReader, PdfWriter def prepare_env(output_dir, pdf_file_name, method): @@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method): return local_image_dir, local_md_dir +# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None): +# # 将字节数据包装在 BytesIO 对象中 +# pdf_file = BytesIO(pdf_bytes) +# # 读取 PDF 的字节数据 +# reader = PdfReader(pdf_file) +# # 创建一个新的 PDF 写入器 +# writer = PdfWriter() +# # 将所有页面添加到新的 PDF 写入器中 +# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1 +# if end_page_id > len(reader.pages) - 1: +# logger.warning("end_page_id is out of range, use pdf_docs length") +# end_page_id = len(reader.pages) - 1 +# for i, page in enumerate(reader.pages): +# if start_page_id <= i <= end_page_id: +# writer.add_page(page) +# # 创建一个字节缓冲区来存储输出的 PDF 数据 +# output_buffer = BytesIO() +# # 将 PDF 写入字节缓冲区 +# writer.write(output_buffer) +# # 获取字节缓冲区的内容 +# converted_pdf_bytes = output_buffer.getvalue() +# return converted_pdf_bytes + + +def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None): + document = fitz.open("pdf", pdf_bytes) + output_document = fitz.open() + end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1 + if end_page_id > len(document) - 1: + logger.warning("end_page_id is out of range, use pdf_docs length") + end_page_id = len(document) - 1 + output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id) + output_bytes = output_document.tobytes() + return output_bytes + + def do_parse( output_dir, pdf_file_name, @@ -55,6 +94,8 @@ def do_parse( f_draw_model_bbox = True f_draw_line_sort_bbox = True + pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id) + orig_model_list = copy.deepcopy(model_list) local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method) @@ -66,15 +107,18 @@ def do_parse( if parse_method == 'auto': jso_useful_key = {'_pdf_type': '', 'model_list': model_list} pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True, - start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, + # start_page_id=start_page_id, end_page_id=end_page_id, + lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) elif parse_method == 'txt': pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True, - start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, + # start_page_id=start_page_id, end_page_id=end_page_id, + lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) elif parse_method == 'ocr': pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True, - start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, + # start_page_id=start_page_id, end_page_id=end_page_id, + lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) else: logger.error('unknown parse method')