Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Jul 13, 2024
2 parents 4d6dcb0 + 351078f commit dad4258
Show file tree
Hide file tree
Showing 61 changed files with 13,591 additions and 79 deletions.
41 changes: 0 additions & 41 deletions .github/workflows/benchmark.yml

This file was deleted.

4 changes: 4 additions & 0 deletions .github/workflows/cli.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,14 @@ jobs:
run: |
pip install -r requirements.txt
pip install -r requirements-qa.txt
pip install magic-pdf
- name: test_cli
run: |
cp magic-pdf.template.json ~/magic-pdf.json
echo $GITHUB_WORKSPACE
cd $GITHUB_WORKSPACE && export PYTHONPATH=. && pytest -s -v tests/test_unit.py
cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py
- name: benchmark
run: |
cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_bench.py
Binary file not shown.
3 changes: 3 additions & 0 deletions tests/magic-pdf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"temp-output-dir": "/tmp/"
}
Binary file not shown.
Binary file not shown.
Binary file added tests/test_cli/conf/__pycache__/conf.cpython-39.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_cli/conf/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
conf = {
"code_path": os.environ.get('GITHUB_WORKSPACE'),
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp"
"pdf_res_path": "/tmp/magic-pdf"
}

Empty file added tests/test_cli/lib/__init__.py
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
116 changes: 116 additions & 0 deletions tests/test_cli/lib/calculate_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""
calculate_score
"""
import os
import re
import json
from Levenshtein import distance
from lib import scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

class Scoring:
"""
calculate_score
"""
def __init__(self, result_path):
"""
init
"""
self.edit_distances = []
self.bleu_scores = []
self.sim_scores = []
self.filenames = []
self.score_dict = {}
self.anntion_cnt = 0
self.fw = open(result_path, "w+", encoding='utf-8')

def simple_bleu_score(self, candidate, reference):
"""
get bleu score
"""
candidate_tokens = word_tokenize(candidate)
reference_tokens = word_tokenize(reference)
return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)


def preprocess_string(self, s):
"""
preprocess_string
"""
sub_enter = re.sub(r'\n+', '\n', s)
return re.sub(r' ', ' ', sub_enter)

def calculate_similarity(self, annotion, actual, tool_type):
"""
calculate_similarity
"""
class_dict = {}
edit_distances = []
bleu_scores = []
sim_scores = list()
total_file = 0
for filename in os.listdir(annotion):
if filename.endswith('.md') and not filename.startswith('.'):
total_file = total_file + 1
with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
content_a = file_a.read()
self.anntion_cnt = self.anntion_cnt + 1
filepath_b = os.path.join(actual, filename)
if os.path.exists(filepath_b):
with open(filepath_b, 'r', encoding='utf-8') as file_b:
content_b = file_b.read()
self.filenames.append(filename)
edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
self.edit_distances.append(edit_dist)
edit_distances.append(edit_dist)
bleu_score = self.simple_bleu_score(content_b, content_a)
bleu_scores.append(bleu_score)
self.bleu_scores.append(bleu_score)
score = scoring.score_text(content_b, content_a)
sim_scores.append(score)
self.sim_scores.append(score)
class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
else:
print(f"File {filename} not found in actual directory.")
class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
ratio = len(class_dict)/total_file
self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
print (f"{tool_type} extract ratio: {ratio}")
print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
return self.score_dict

def summary_scores(self):
"""
calculate the average of edit distance, bleu score and sim score
"""
over_all_dict = dict()
average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
over_all_dict["average_edit_distance"] = average_edit_distance
over_all_dict["average_bleu_score"] = average_bleu_score
over_all_dict["average_sim_score"] = average_sim_score
self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
return over_all_dict

def calculate_similarity_total(self, tool_type, download_dir):
"""
calculate the average of edit distance, bleu score and sim score
"""
annotion = os.path.join(download_dir, "annotations", "cleaned")
actual = os.path.join(download_dir, tool_type, "cleaned")
score = self.calculate_similarity(annotion, actual, tool_type)
return score

26 changes: 12 additions & 14 deletions tests/test_cli/lib/common.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
import subprocess
import os


def check_shell(cmd):
"""
shell successful
"""
res = os.system(cmd)
assert res == 0

def count_folders_and_check_contents(directory):
# 获取目录下的所有文件和文件夹
contents = os.listdir(directory)
folder_count = 0
for item in contents:
# 检查是否为文件夹
if os.path.isdir(os.path.join(directory, item)):
# 检查文件夹是否为空
folder_path = os.path.join(directory, item)
for folder in os.listdir(folder_path):
folder_count = folder_count + 1
assert os.listdir(folder_path) is not None
print (folder_count)
assert folder_count == 14
def count_folders_and_check_contents(file_path):
""""
获取文件夹大小
"""
if os.path.exists(file_path):
folder_count = os.path.getsize(file_path)
assert folder_count > 0


if __name__ == "__main__":
Expand Down
128 changes: 128 additions & 0 deletions tests/test_cli/lib/pre_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""
clean data
"""
import argparse
import os
import re
import htmltabletomd # type: ignore
import pypandoc
import argparse

parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
"--tool_name",
type=str,
required=True,
help="input tool name",
)
parser.add_argument(
"--download_dir",
type=str,
required=True,
help="input download dir",
)
args = parser.parse_args()

def clean_markdown_images(content):
"""
clean markdown images
"""
pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
cleaned_content = pattern.sub('', content)
return cleaned_content

def clean_ocrmath_photo(content):
"""
clean ocrmath photo
"""
pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
cleaned_content = pattern.sub('', content)
return cleaned_content

def convert_html_table_to_md(html_table):
"""
convert html table to markdown table
"""
lines = html_table.strip().split('\n')
md_table = ''
if lines and '<tr>' in lines[0]:
in_thead = True
for line in lines:
if '<th>' in line:
cells = re.findall(r'<th>(.*?)</th>', line)
md_table += '| ' + ' | '.join(cells) + ' |\n'
in_thead = False
elif '<td>' in line and not in_thead:
cells = re.findall(r'<td>(.*?)</td>', line)
md_table += '| ' + ' | '.join(cells) + ' |\n'
md_table = md_table.rstrip() + '\n'
return md_table

def convert_latext_to_md(content):
"""
convert latex table to markdown table
"""
tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
placeholders = []
for table in tables:
placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
content = content.replace(replace_str, placeholder)
try:
pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
except:
markdown_string = replace_str
else:
markdown_string = open('output.md', 'r', encoding='utf-8').read()
placeholders.append((placeholder, markdown_string))
new_content = content
for placeholder, md_table in placeholders:
new_content = new_content.replace(placeholder, md_table)
# 写入文件
return new_content


def convert_htmltale_to_md(content):
"""
convert html table to markdown table
"""
tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
placeholders = []
for table in tables:
placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
content = content.replace(f"<table>{table}</table>", placeholder)
try:
convert_table = htmltabletomd.convert_table(table)
except:
convert_table = table
placeholders.append((placeholder,convert_table))
new_content = content
for placeholder, md_table in placeholders:
new_content = new_content.replace(placeholder, md_table)
# 写入文件
return new_content

def clean_data(prod_type, download_dir):
"""
clean data
"""
tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
if not os.path.exists(tgt_dir):
os.makedirs(tgt_dir)
source_dir = os.path.join(download_dir, prod_type)
filenames = os.listdir(source_dir)
for filename in filenames:
if filename.endswith('.md'):
input_file = os.path.join(source_dir, filename)
output_file = os.path.join(tgt_dir, "cleaned_" + filename)
with open(input_file, 'r', encoding='utf-8') as fr:
content = fr.read()
new_content = clean_markdown_images(content)
with open(output_file, 'w', encoding='utf-8') as fw:
fw.write(new_content)


if __name__ == '__main__':
tool_type = args.tool_name
download_dir = args.download_dir
clean_data(tool_type, download_dir)
Loading

0 comments on commit dad4258

Please sign in to comment.