Skip to content

Commit

Permalink
add para_to_standard_format logic
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Apr 25, 2024
1 parent c7cca21 commit d3542f6
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 12 deletions.
14 changes: 7 additions & 7 deletions magic_pdf/cli/magicpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,13 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
path=f"{pdf_file_name}.json",
mode=AbsReaderWriter.MODE_TXT,
)
# try:
# content_list = pipe.pipe_mk_uni_format()
# except Exception as e:
# logger.exception(e)
# md_writer.write(
# str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
# )
try:
content_list = pipe.pipe_mk_uni_format()
except Exception as e:
logger.exception(e)
md_writer.write(
str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
)


@click.group()
Expand Down
50 changes: 46 additions & 4 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,16 +201,58 @@ def para_to_standard_format(para, img_buket_path):
return para_content


def para_to_standard_format_v2(para_block, img_buket_path):
    """Convert one paragraph block into a content-list entry.

    The shape of the returned dict depends on ``para_block['type']``:

    * ``BlockType.Text``  -> ``{'type': 'text', 'text': ...}``
    * ``BlockType.Title`` -> same as text, plus ``'text_level': 1``
    * ``BlockType.InterlineEquation`` ->
      ``{'type': 'equation', 'text': ..., 'text_format': 'latex'}``
    * ``BlockType.Image`` -> ``{'type': 'image'}`` plus ``'img_path'`` /
      ``'img_caption'`` when the matching sub-blocks are present
    * ``BlockType.Table`` -> ``{'type': 'table'}`` plus ``'img_path'`` /
      ``'table_caption'`` / ``'table_footnote'`` when the matching
      sub-blocks are present

    Args:
        para_block: Block dict with a ``'type'`` key. Image and table
            blocks must also carry a ``'blocks'`` list of sub-blocks.
        img_buket_path: Prefix handed to ``join_path`` together with the
            ``'image_path'`` of the body sub-block's first span.

    Returns:
        The content entry, or an empty dict when the block type is not
        one of the types listed above.
    """
    para_type = para_block['type']
    # Start from an empty entry: without this, an unhandled block type
    # reached ``return`` with ``para_content`` unbound (UnboundLocalError).
    para_content = {}
    if para_type == BlockType.Text:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1,
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': "latex",
        }
    elif para_type == BlockType.Image:
        para_content = {
            'type': 'image',
        }
        for block in para_block['blocks']:
            block_type = block['type']
            if block_type == BlockType.ImageBody:
                # The image file is referenced by the first span of the
                # body's first line.
                para_content['img_path'] = join_path(
                    img_buket_path,
                    block["lines"][0]["spans"][0]['image_path'],
                )
            elif block_type == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
        para_content = {
            'type': 'table',
        }
        for block in para_block['blocks']:
            block_type = block['type']
            if block_type == BlockType.TableBody:
                # Tables are exported via the 'image_path' of the body's
                # first span, same as image blocks.
                para_content['img_path'] = join_path(
                    img_buket_path,
                    block["lines"][0]["spans"][0]['image_path'],
                )
            elif block_type == BlockType.TableCaption:
                para_content['table_caption'] = merge_para_with_text(block)
            elif block_type == BlockType.TableFootnote:
                para_content['table_footnote'] = merge_para_with_text(block)

    return para_content


def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Build the flat content list for a whole document.

    Each page's ``"para_blocks"`` entry is read as a flat list of block
    dicts; every block is converted with ``para_to_standard_format_v2``
    and the results are collected in page order.

    Args:
        pdf_info_dict: Iterable of per-page dicts (despite the name, it is
            iterated as a sequence of pages).
        img_buket_path: Passed through unchanged to
            ``para_to_standard_format_v2``.

    Returns:
        A list with one content entry per paragraph block.
    """
    content_list = []
    for page_info in pdf_info_dict:
        # Pages with a missing or empty "para_blocks" contribute nothing.
        para_blocks = page_info.get("para_blocks")
        if not para_blocks:
            continue
        # Only the v2 converter is used: the older nested loop over
        # ``para_to_standard_format`` was superseded by this one and must
        # not run alongside it.
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path)
            for para_block in para_blocks
        )
    return content_list


Expand Down
3 changes: 2 additions & 1 deletion magic_pdf/pipe/AbsPipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path)
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
elif parse_type == AbsPipe.PIP_OCR:
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
return content_list
Expand Down

0 comments on commit d3542f6

Please sign in to comment.