-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathexample_annotate.py
84 lines (63 loc) · 3.81 KB
/
example_annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import os
import pkg_resources
from nalaf.utils.readers import TextFilesReader, PMIDReader, HTMLReader
from nalaf.utils.readers import StringReader
from nalaf.utils.writers import ConsoleWriter, TagTogFormat, PubTatorFormat
from nalaf.structures.dataset_pipelines import PrepareDatasetPipeline
from nalaf.learning.crfsuite import PyCRFSuite
from nalaf.learning.taggers import StubSameSentenceRelationExtractor
from nalaf.structures.data import Dataset
from nalaf.utils.annotation_readers import AnnJsonAnnotationReader
# Class identifiers used by the demo script in this file.
# ENT1_CLASS_ID and ENT2_CLASS_ID are the two entity class ids handed to the
# relation extractor and to the console writer; ENT2_CLASS_ID is additionally
# the class id the CRF tagger annotates with.
ENT1_CLASS_ID = 'e_x'
ENT2_CLASS_ID = 'e_y'
# Relation class id passed to the same-sentence relation extractor together
# with the two entity class ids above.
REL_ENT1_ENT2_CLASS_ID = 'r_z'
# NOTE(review): the two ids below are not referenced anywhere in the visible
# part of this file; judging by their names they are presumably normalization
# ids (Entrez Gene / UniProt) -- confirm before relying on them.
ENTREZ_GENE_ID = 'n_w'
UNIPROT_ID = 'n_v'
def _build_parser():
    """Build the command-line parser for the prediction demo.

    Returns:
        argparse.ArgumentParser: parser exposing the ``--color``/``--no_color``
        switches, the ``-o``/``-f`` output options, and exactly one required
        input source out of ``-s``, ``-d`` and ``-p``.
    """
    parser = argparse.ArgumentParser(description='A simple demo for using the nalaf pipeline for prediction')

    parser.add_argument('--color', help='uses color for highlighting predictions if supported '
                                        'otherwise prints them in new line',
                        action='store_true', default=True, dest='color')
    parser.add_argument('--no_color', help='prints predictions in new line',
                        action='store_false', dest='color')

    parser.add_argument('-o', '--output_dir', help='write the output to the provided directory, '
                                                   'the format can be specified with the -f switch, '
                                                   'otherwise the output will be written to the standard console')
    parser.add_argument('-f', '--file_format', help='the format for writing the output to a directory',
                        choices=['ann.json', 'pubtator'], default='ann.json')

    # Exactly one input source must be given.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-s', '--string', help='string you want to predict for')
    group.add_argument('-d', '--dir_or_file', help='directory or file you want to predict for')
    group.add_argument('-p', '--pmids', nargs='+', help='a single PMID or a list of PMIDs separated by space')

    return parser


def _read_dataset(args):
    """Read the input selected on the command line into a dataset.

    Args:
        args: parsed arguments as produced by the parser of this script.

    Returns:
        The object returned by the matching reader's ``read()`` call.

    Raises:
        FileNotFoundError: if ``--dir_or_file`` points to a path that does not exist.
        ValueError: if the path exists but is neither a ``.txt`` nor a ``.html`` file.
    """
    # Compare against None so that an explicitly passed empty string (-s "")
    # is still treated as the chosen input source instead of falling through
    # to the path branch with dir_or_file=None.
    if args.string is not None:
        return StringReader(args.string).read()
    if args.pmids:
        return PMIDReader(args.pmids).read()

    path = args.dir_or_file
    if path is None or not os.path.exists(path):
        raise FileNotFoundError('directory or file "{}" does not exist'.format(path))

    root, extension = os.path.splitext(path)
    if extension == ".txt":
        return TextFilesReader(path).read()
    if extension == ".html":
        dataset = HTMLReader(path, whole_basename_as_docid=True, hdfs_url=None, hdfs_user=None).read()
        # We assume there is a second file, for the ann.json
        # Only the trailing extension is swapped, so a ".html" occurring
        # elsewhere in the path (e.g. in a parent directory) is left alone.
        json_path = root + '.json'
        AnnJsonAnnotationReader(json_path, read_only_class_id=None, whole_basename_as_docid=True,
                                hdfs_url=None, hdfs_user=None).annotate(dataset)
        return dataset

    # Previously this case left the dataset unassigned and the script crashed
    # later with a NameError.
    # NOTE(review): the help text advertises directories, but no branch reads
    # them; presumably TextFilesReader accepts a directory -- confirm and wire
    # it in here if so.
    raise ValueError('unsupported input "{}": expected a .txt or .html file'.format(path))


def _annotate(dataset):
    """Run dataset preparation, the CRF tagger and the relation extractor on `dataset`."""
    PrepareDatasetPipeline().execute(dataset)

    # get the predictions -- "example_entity_model" is only available in the nalaf src distribution
    crf = PyCRFSuite(model_file=pkg_resources.resource_filename('nalaf.data', 'example_entity_model'))
    crf.annotate(dataset, class_id=ENT2_CLASS_ID)

    StubSameSentenceRelationExtractor(ENT1_CLASS_ID, ENT2_CLASS_ID, REL_ENT1_ENT2_CLASS_ID).annotate(dataset)


def _write_output(dataset, args):
    """Export `dataset` to ``args.output_dir`` if given, otherwise write it to the console.

    Raises:
        NotADirectoryError: if ``args.output_dir`` is set but is not an existing directory.
    """
    if not args.output_dir:
        ConsoleWriter(ENT1_CLASS_ID, ENT2_CLASS_ID, args.color).write(dataset)
        return

    if not os.path.isdir(args.output_dir):
        raise NotADirectoryError('{} is not a directory'.format(args.output_dir))

    if args.file_format == 'ann.json':
        TagTogFormat(dataset, use_predicted=True, to_save_to=args.output_dir).export(threshold_val=0)
    elif args.file_format == 'pubtator':
        PubTatorFormat(dataset, location=os.path.join(args.output_dir, 'pubtator.txt')).export()


def main(argv=None):
    """Entry point of the demo: parse arguments, read, annotate and write the dataset.

    Args:
        argv: optional list of command-line arguments; when None, argparse
            falls back to ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)
    dataset = _read_dataset(args)
    _annotate(dataset)
    _write_output(dataset, args)


if __name__ == "__main__":
    main()