-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcombineAnnotationsParallel.py
executable file
·84 lines (77 loc) · 3.4 KB
/
combineAnnotationsParallel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/python -W all
"""
combineAnnotationsParallel.py: combine the tags generated by machine learning experiments
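usage: paste file1 [file2 ..] | combineAnnotationsParallel.py [-m minConfidence] [-h] [file1 [file2 ...]]
notes:
- expected input line format: hashsign number colon gold-tag guessed-tag confidence-score [repeated]
- option -m specifies acceptance confidence score for classes (default: -1000.0)
- option -h prints a heading line before the output
- the optional arguments are used as class names in the output, in input block order; without them classes are numbered from 1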
20170622 erikt(at)xs4all.nl
"""
import getopt
import re
import sys
COMMAND = sys.argv.pop(0) # this command; removed from sys.argv so only options and arguments remain
# usage message shown when the command line options cannot be parsed
USAGE = "paste file1 [file2 ..] | "+COMMAND+" [-m minConfidence] [-h] [file1 [file2 ...]]"
OTHER = "O" # other tag
NONE = "-1" # unknown tag
HASH = "#" # hash sign
BLOCKSIZE = 5 # number of data items per block: hash id gold guess confidence
minConfidence = -1000.0 # minimum confidence score for class assignment (changed by option -m)
hasHeading = False # print a heading line before the output (set by option -h)
# check for command line options
def checkOptions():
    """Parse the command line options found in sys.argv.

    Recognized options:
      -m value   store float(value) in the global minConfidence
      -h         set the global hasHeading to True

    Returns the list of remaining (non-option) command line arguments.
    Exits with the USAGE message when an option is unknown, when -m lacks
    its value, or when the -m value is not a number.
    """
    global hasHeading,minConfidence
    # the whole of sys.argv is parsed: the command name was already popped
    # from it when COMMAND was set
    try:
        (opts,args) = getopt.getopt(sys.argv,"hm:",[])
    except getopt.GetoptError:
        # only option parsing errors lead to the usage message; anything
        # else (for example KeyboardInterrupt) is no longer swallowed
        sys.exit(USAGE)
    for (name,value) in opts:
        if name == "-m":
            try:
                minConfidence = float(value)
            except ValueError:
                # a non-numeric threshold is a usage error, not a crash
                sys.exit(USAGE)
        elif name == "-h":
            hasHeading = True
    return(args) # remaining command line arguments
# check options
args = checkOptions()

def _readBlocks(line,fields):
    """Extract the gold tag and the confidence scores from one input line.

    fields holds the whitespace-separated tokens of line, organized in
    blocks of BLOCKSIZE tokens: hash sign, id, gold tag, guessed tag and
    confidence score.

    Returns (gold,confidences): gold is the gold tag of the last block
    whose gold tag differs from OTHER (the empty string when there is no
    such block) and confidences lists the score of every block as a float.
    Exits with an error message when a block does not start with HASH or
    when the final block is incomplete.
    """
    confidences = []
    gold = ""
    for start in range(0,len(fields),BLOCKSIZE):
        if fields[start] != HASH:
            sys.exit(COMMAND+": missing hash sign on position "+str(start+1)+" of line: "+line+"\n")
        if len(fields) < start+BLOCKSIZE:
            # BLOCKSIZE is an integer and must be converted before concatenation
            sys.exit(COMMAND+": number of tokens is not a multiple of "+str(BLOCKSIZE)+" on line "+line+"\n")
        # assume that class values are numeric and data line is sorted
        confidences.append(float(fields[start+BLOCKSIZE-1]))
        if fields[start+2] != OTHER: gold = fields[start+2]
    return(gold,confidences)

def _selectBest(confidences,threshold):
    """Find the two highest confidence scores that reach threshold.

    Returns (bestIndex,bestConfidence,secondBestIndex,secondBestConfidence).
    An index of -1 combined with a confidence of -1 means that no score
    was selected for that rank.
    """
    bestConfidence = -1
    bestIndex = -1
    secondBestConfidence = -1
    secondBestIndex = -1
    for (i,value) in enumerate(confidences):
        # every score is tested against the threshold by its own value
        if value < threshold: continue
        # NOTE(review): -1 doubles as the "nothing selected" value, so
        # scores of -1 or lower are never selected -- confirm that the
        # scores cannot be that low
        if value > bestConfidence:
            secondBestConfidence = bestConfidence
            secondBestIndex = bestIndex
            bestConfidence = value
            bestIndex = i
        elif value > secondBestConfidence:
            secondBestConfidence = value
            secondBestIndex = i
    return(bestIndex,bestConfidence,secondBestIndex,secondBestConfidence)

def _classLabel(index,classNames):
    """Return the output label for the block at position index.

    Without class names the label is the block number counted from 1,
    which makes it "0" when no block was selected (index -1). With class
    names the label is classNames[index]. Exits with an error message
    when there are fewer class names than index requires.
    """
    if len(classNames) == 0: return(str(index+1))
    # NOTE(review): index -1 would select the last class name; NONE is
    # used instead -- confirm that this is the wanted label
    if index < 0: return(NONE)
    if len(classNames) < index+1:
        sys.exit(COMMAND+": too few class arguments, looking for "+str(index+1)+"\n")
    return(classNames[index])

nbrOfFields = -1 # number of fields on the first input line, -1: not yet known
# print() with one parenthesized string works in both Python 2 and Python 3
if hasHeading: print("gold bestClass bestConfidence secondBestClass secondBestConfidence")
# process input
for line in sys.stdin:
    line = line.rstrip()
    fields = line.split()
    # all lines must contain as many fields as the first line
    if nbrOfFields < 0: nbrOfFields = len(fields)
    if len(fields) != nbrOfFields:
        sys.exit(COMMAND+": unexpected number of fields on line: "+line+"\n")
    (gold,confidences) = _readBlocks(line,fields)
    (bestIndex,bestConfidence,secondBestIndex,secondBestConfidence) = _selectBest(confidences,minConfidence)
    bestClass = _classLabel(bestIndex,args)
    secondBestClass = _classLabel(secondBestIndex,args)
    print("%s %s %0.3f %s %0.3f" % (gold,bestClass,bestConfidence,secondBestClass,secondBestConfidence))