Commit: Merge remote-tracking branch 'origin/master'
Showing 61 changed files with 13,591 additions and 79 deletions.
@@ -0,0 +1,3 @@
{
    "temp-output-dir": "/tmp/"
}
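The new config carries a single key; presumably the benchmark reads it with the standard json module. A minimal sketch of such a reader (the load_config helper and its argument are illustrative, not part of this commit):

import json

def load_config(config_path):
    # Hypothetical helper, not in this commit: read the benchmark config
    # and return the temp output directory, defaulting to /tmp/.
    with open(config_path, 'r', encoding='utf-8') as f:
        cfg = json.load(f)
    return cfg.get("temp-output-dir", "/tmp/")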
@@ -0,0 +1,116 @@
""" | ||
calculate_score | ||
""" | ||
import os | ||
import re | ||
import json | ||
from Levenshtein import distance | ||
from lib import scoring | ||
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction | ||
from nltk.tokenize import word_tokenize | ||
import nltk | ||
nltk.download('punkt') | ||
|
||
class Scoring: | ||
""" | ||
calculate_score | ||
""" | ||
def __init__(self, result_path): | ||
""" | ||
init | ||
""" | ||
self.edit_distances = [] | ||
self.bleu_scores = [] | ||
self.sim_scores = [] | ||
self.filenames = [] | ||
self.score_dict = {} | ||
self.anntion_cnt = 0 | ||
self.fw = open(result_path, "w+", encoding='utf-8') | ||
|
||
def simple_bleu_score(self, candidate, reference): | ||
""" | ||
get bleu score | ||
""" | ||
candidate_tokens = word_tokenize(candidate) | ||
reference_tokens = word_tokenize(reference) | ||
return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1) | ||
|
||
|
||
def preprocess_string(self, s): | ||
""" | ||
preprocess_string | ||
""" | ||
sub_enter = re.sub(r'\n+', '\n', s) | ||
return re.sub(r' ', ' ', sub_enter) | ||
|
||
def calculate_similarity(self, annotion, actual, tool_type): | ||
""" | ||
calculate_similarity | ||
""" | ||
class_dict = {} | ||
edit_distances = [] | ||
bleu_scores = [] | ||
sim_scores = list() | ||
total_file = 0 | ||
for filename in os.listdir(annotion): | ||
if filename.endswith('.md') and not filename.startswith('.'): | ||
total_file = total_file + 1 | ||
with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a: | ||
content_a = file_a.read() | ||
self.anntion_cnt = self.anntion_cnt + 1 | ||
filepath_b = os.path.join(actual, filename) | ||
if os.path.exists(filepath_b): | ||
with open(filepath_b, 'r', encoding='utf-8') as file_b: | ||
content_b = file_b.read() | ||
self.filenames.append(filename) | ||
edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b)) | ||
self.edit_distances.append(edit_dist) | ||
edit_distances.append(edit_dist) | ||
bleu_score = self.simple_bleu_score(content_b, content_a) | ||
bleu_scores.append(bleu_score) | ||
self.bleu_scores.append(bleu_score) | ||
score = scoring.score_text(content_b, content_a) | ||
sim_scores.append(score) | ||
self.sim_scores.append(score) | ||
class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score} | ||
self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score} | ||
else: | ||
print(f"File {filename} not found in actual directory.") | ||
class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0 | ||
class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0 | ||
class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0 | ||
self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n") | ||
ratio = len(class_dict)/total_file | ||
self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n") | ||
self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n") | ||
self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n") | ||
self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n") | ||
print (f"{tool_type} extract ratio: {ratio}") | ||
print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}") | ||
print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}") | ||
print (f"{tool_type} Average Sim Score: {class_average_sim_score}") | ||
return self.score_dict | ||
|
||
def summary_scores(self): | ||
""" | ||
calculate the average of edit distance, bleu score and sim score | ||
""" | ||
over_all_dict = dict() | ||
average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0 | ||
average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0 | ||
average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0 | ||
over_all_dict["average_edit_distance"] = average_edit_distance | ||
over_all_dict["average_bleu_score"] = average_bleu_score | ||
over_all_dict["average_sim_score"] = average_sim_score | ||
self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n") | ||
return over_all_dict | ||
|
||
def calculate_similarity_total(self, tool_type, download_dir): | ||
""" | ||
calculate the average of edit distance, bleu score and sim score | ||
""" | ||
annotion = os.path.join(download_dir, "annotations", "cleaned") | ||
actual = os.path.join(download_dir, tool_type, "cleaned") | ||
score = self.calculate_similarity(annotion, actual, tool_type) | ||
return score | ||
|
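A driver would construct one Scoring instance per run, score each tool's cleaned output, then summarize. A minimal usage sketch (the module name is assumed from the docstring; the result path and tool name are illustrative, not from this commit):

from calculate_score import Scoring  # module name assumed

scorer = Scoring("scores.jsonl")  # illustrative output path
# Scores <dir>/<tool>/cleaned/ against <dir>/annotations/cleaned/.
scorer.calculate_similarity_total("marker", "/data/benchmark")  # illustrative tool/dir
overall = scorer.summary_scores()
print(overall)  # averages of edit distance, BLEU, and sim score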
@@ -0,0 +1,128 @@
""" | ||
clean data | ||
""" | ||
import argparse | ||
import os | ||
import re | ||
import htmltabletomd # type: ignore | ||
import pypandoc | ||
import argparse | ||
|
||
parser = argparse.ArgumentParser(description="get tool type") | ||
parser.add_argument( | ||
"--tool_name", | ||
type=str, | ||
required=True, | ||
help="input tool name", | ||
) | ||
parser.add_argument( | ||
"--download_dir", | ||
type=str, | ||
required=True, | ||
help="input download dir", | ||
) | ||
args = parser.parse_args() | ||
|
||
def clean_markdown_images(content): | ||
""" | ||
clean markdown images | ||
""" | ||
pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE) | ||
cleaned_content = pattern.sub('', content) | ||
return cleaned_content | ||
|
||
def clean_ocrmath_photo(content): | ||
""" | ||
clean ocrmath photo | ||
""" | ||
pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE) | ||
cleaned_content = pattern.sub('', content) | ||
return cleaned_content | ||
|
||
def convert_html_table_to_md(html_table): | ||
""" | ||
convert html table to markdown table | ||
""" | ||
lines = html_table.strip().split('\n') | ||
md_table = '' | ||
if lines and '<tr>' in lines[0]: | ||
in_thead = True | ||
for line in lines: | ||
if '<th>' in line: | ||
cells = re.findall(r'<th>(.*?)</th>', line) | ||
md_table += '| ' + ' | '.join(cells) + ' |\n' | ||
in_thead = False | ||
elif '<td>' in line and not in_thead: | ||
cells = re.findall(r'<td>(.*?)</td>', line) | ||
md_table += '| ' + ' | '.join(cells) + ' |\n' | ||
md_table = md_table.rstrip() + '\n' | ||
return md_table | ||
|
||
def convert_latext_to_md(content): | ||
""" | ||
convert latex table to markdown table | ||
""" | ||
tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL) | ||
placeholders = [] | ||
for table in tables: | ||
placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->" | ||
replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}" | ||
content = content.replace(replace_str, placeholder) | ||
try: | ||
pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8") | ||
except: | ||
markdown_string = replace_str | ||
else: | ||
markdown_string = open('output.md', 'r', encoding='utf-8').read() | ||
placeholders.append((placeholder, markdown_string)) | ||
new_content = content | ||
for placeholder, md_table in placeholders: | ||
new_content = new_content.replace(placeholder, md_table) | ||
# 写入文件 | ||
return new_content | ||
|
||
|
||
def convert_htmltale_to_md(content): | ||
""" | ||
convert html table to markdown table | ||
""" | ||
tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL) | ||
placeholders = [] | ||
for table in tables: | ||
placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->" | ||
content = content.replace(f"<table>{table}</table>", placeholder) | ||
try: | ||
convert_table = htmltabletomd.convert_table(table) | ||
except: | ||
convert_table = table | ||
placeholders.append((placeholder,convert_table)) | ||
new_content = content | ||
for placeholder, md_table in placeholders: | ||
new_content = new_content.replace(placeholder, md_table) | ||
# 写入文件 | ||
return new_content | ||
|
||
def clean_data(prod_type, download_dir): | ||
""" | ||
clean data | ||
""" | ||
tgt_dir = os.path.join(download_dir, prod_type, "cleaned") | ||
if not os.path.exists(tgt_dir): | ||
os.makedirs(tgt_dir) | ||
source_dir = os.path.join(download_dir, prod_type) | ||
filenames = os.listdir(source_dir) | ||
for filename in filenames: | ||
if filename.endswith('.md'): | ||
input_file = os.path.join(source_dir, filename) | ||
output_file = os.path.join(tgt_dir, "cleaned_" + filename) | ||
with open(input_file, 'r', encoding='utf-8') as fr: | ||
content = fr.read() | ||
new_content = clean_markdown_images(content) | ||
with open(output_file, 'w', encoding='utf-8') as fw: | ||
fw.write(new_content) | ||
|
||
|
||
if __name__ == '__main__': | ||
tool_type = args.tool_name | ||
download_dir = args.download_dir | ||
clean_data(tool_type, download_dir) |
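The script is driven entirely by its two required flags. A typical invocation, assuming the file is saved as clean_data.py (the script name, tool name, and directory are illustrative):

# CLI: python clean_data.py --tool_name marker --download_dir /data/benchmark
# Equivalent direct call, bypassing argparse:
clean_data("marker", "/data/benchmark")  # writes cleaned_*.md into /data/benchmark/marker/cleaned/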