Skip to content

Fix unicode encoding error #42

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions consensus_and_scoring/TriagerScoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
jpath1 = 'FormTriager1.2C2-2018-07-25T23.json'
jpath2 = 'SemanticsTriager1.3C2-2018-07-25T23.json'

def importData(path, out_path):
def importData(path, out_path, texts_dir = None):
'''

:param path: location of the triage data
Expand Down Expand Up @@ -47,6 +47,22 @@ def importData(path, out_path):
#flagExclusions = exclusionList(users, flags, cats)
flagExclusions = []
#print(flagExclusions)

#try to handle texts if path is provided:
if texts_dir:
text_file = os.path.join(texts_dir, a + ".txt")
if not(os.path.exists(text_file)):
for root, dir, files in os.walk(text_file):
for file in files:
print(file)
raise Exception("Couldn't find text_file for article {}".format(text_file))

if text_file == None:
raise Exception("Couldn't find text_file for article", a)

with open(text_file, 'r', encoding='utf-8') as file:
source_text = file.read()

if annotator_count >= STRICT_MINIMUM_CONTRIBUTORS:
cats = np.unique(art_data['topic_name'])
for c in cats:
Expand All @@ -59,10 +75,11 @@ def importData(path, out_path):
namespaces = cat_data['namespace'].tolist()

length = floor(cat_data['article_text_length'].tolist()[0])
texts = cat_data['target_text'].str.decode('unicode-escape').tolist()

print('//Article:', a, 'Category:', c, 'numUsers:', numUsers)
source_text = addToSourceText(starts, ends, texts, source_text)
if texts_dir is None:
texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
source_text = addToSourceText(starts, ends, texts, source_text)
pstarts, pends, pflags = scoreTriager(starts, ends, users, numUsers, flags, length, c, flagExclusions)
out = appendData(filename[0], a, task_uuids, namespaces, pstarts, pends, c, pflags, out, source_text)

Expand All @@ -78,7 +95,7 @@ def appendData(article_filename, article_sha256, task_uuids, namespaces,start_po
case_numbers = np.zeros(len(start_pos_list))
for i in range(len(start_pos_list)):
text = getText(start_pos_list[i], end_pos_list[i],source_text)
text = text.encode('unicode-escape').decode('utf-8')
#text = text.encode('unicode-escape').decode('utf-8')
#print(len(namespaces), len(start_pos_list), len(end_pos_list), len(case_numbers))
data.append([article_filename, article_sha256, task_uuids[i], namespaces[i], start_pos_list[i], end_pos_list[i], topic_name, int(case_numbers[i]), text])
return data
Expand Down Expand Up @@ -388,7 +405,9 @@ def load_args():

if __name__ == '__main__':
args = load_args()
input_file = '../data/highlighter/ESTF_HardTriage-2021-05-14T0016-Highlighter.csv'
input_file = '../data/highlighter/DK_off.csv'
texts_dir = '../data/texts/'

if args.input_file:
input_file = args.input_file
dirname = os.path.dirname(input_file)
Expand All @@ -398,4 +417,4 @@ def load_args():
output_file = args.output_file
print("Input: {}".format(input_file))
print("Output: {}".format(output_file))
importData(input_file, output_file)
importData(input_file, output_file, texts_dir)
3 changes: 3 additions & 0 deletions consensus_and_scoring/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ def fetch_tags_files(body, dir_dict):
def fetch_highlighter_files(body, dir_dict):
    """Retrieve the highlighter and article-text files listed in ``body``.

    Reads the 'Highlighters' and 'Texts' entries of ``body`` (each falls
    back to an empty list when the key is absent). The highlighter list is
    passed to ``retrieve_file_list`` with ``dir_dict['highlighters_dir']``;
    the texts list is first passed through ``use_article_sha256_filenames``
    and then to ``retrieve_file_list`` with ``dir_dict['texts_dir']``.

    :param body: mapping providing the 'Highlighters' and 'Texts' entries
    :param dir_dict: mapping providing 'highlighters_dir' and 'texts_dir'
    """
    # Highlighter files go to the highlighters directory.
    highlighter_files = body.get('Highlighters', [])
    retrieve_file_list(highlighter_files, dir_dict['highlighters_dir'])

    # Text files are run through use_article_sha256_filenames before retrieval.
    text_files = use_article_sha256_filenames(body.get('Texts', []))
    retrieve_file_list(text_files, dir_dict['texts_dir'])

    logger.info("---FILES RETRIEVED SUCCESSFULLY in request_highlighter_consensus handler---")

def fetch_datahunt_files(body, dir_dict):
Expand Down
3 changes: 2 additions & 1 deletion consensus_and_scoring/process_dirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def configure_consensus_directories(task_type, parent_dirname):
if task_type == "HLTR":
dir_dict['highlighters_dir'] = make_dir(parent_dirname, 'highlighters')
dir_dict['consensus_dir']= make_dir(parent_dirname, "output_HLTR_consensus")
dir_dict['texts_dir'] = make_dir(parent_dirname, 'texts')
clean_output_csvs(dir_dict['consensus_dir'])
elif task_type == "QUIZ":
dir_dict['config_path'] = './config/'
Expand All @@ -54,7 +55,7 @@ def generate_highlighter_consensus(dir_dict):
if filename.endswith(".csv"):
input_file = os.path.join(highlighters_dir, filename)
output_file = os.path.join(consensus_dir, "S_IAA_" + filename)
importData(input_file, output_file)
importData(input_file, output_file, dir_dict['texts_dir'])

def generate_datahunt_consensus(dir_dict):
uuids_to_filter = read_filter_uuids('./data_patches/')
Expand Down