-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathreconcile.py
executable file
·245 lines (200 loc) · 7.75 KB
/
reconcile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env python3
import argparse
import os
import textwrap
import warnings
import zipfile
from os.path import basename
from pylib import summary
from pylib import utils
from pylib.table import Table
VERSION = "0.8.4"
def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Builds the argument parser, parses the process's command line, attaches a
    few fixed settings that have no command-line option (``row_key``,
    ``user_column``, ``max_transcriptions``), and range-checks the fuzzy
    matching thresholds.

    Returns:
        The parsed namespace. ``format`` holds the chosen format with a
        ``"_format"`` suffix appended (e.g. ``"nfn_format"``), which is the
        key ``main`` uses to look up the reader plugin.

    Out-of-range thresholds are reported through ``utils.error_exit``. A
    ``UserWarning`` is issued when ``--column-types`` is combined with the
    ``nfn`` format.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Lets arguments be read from a file given as "@path".
        fromfile_prefix_chars="@",
        description=textwrap.dedent(
            """
            This takes raw Notes from Nature classifications and creates a
            reconciliation of the classifications for a particular workflow.
            That is, it reduces n classifications per subject to the "best"
            values."""
        ),
        epilog=textwrap.dedent(
            """
            Current reconciliation types
            ----------------------------
            select: Reconcile a fixed list of options.
            text: Reconcile free text entries.
            same: Check that all items in a group are the same.
            box: Reconcile drawn bounding boxes, the mean of the corners.
            Required box format:
            {"x": <int>, "y": <int>, "width": <int>, "height": <int>}
            point: Calculate the mean of a point. Required point format:
            {"x": <int>, "y": <int>}
            noop: Do nothing with this field.
            length: Calculate the length of a drawn line. It first calculates the
            mean of the end points and then uses a scale to get the
            calibrated length relative to the scale. Required length format:
            {"x1": <int>, "y1": <int>, "x2": <int>, "y2": <int>}
            To get actual lengths (vs. pixel) you will need a scale length
            header with a number and column with units. Ex: "scale 0.5 mm".
            """
        ),
    )

    parser.add_argument("input_file", metavar="INPUT-FILE", help="""The input file.""")

    # ---- Output destinations ------------------------------------------------
    parser.add_argument(
        "-u",
        "--unreconciled",
        help="""Write the unreconciled workflow classifications to this CSV file.""",
    )

    parser.add_argument(
        "-r",
        "--reconciled",
        help="""Write the reconciled classifications to this CSV file.""",
    )

    parser.add_argument(
        "-s",
        "--summary",
        help="""Write a summary of the reconciliation to this HTML file.""",
    )

    parser.add_argument(
        "-e",
        "--explanations",
        action="store_true",
        help="""Output the reconciled explanations with the reconciled classifications
            CSV file.""",
    )

    parser.add_argument(
        "-z",
        "--zip",
        help="""Zip the output files and put them into this archive.""",
    )

    # ---- Workflow selection -------------------------------------------------
    parser.add_argument(
        "-n",
        "--workflow-name",
        help="""The name of the workflow. NfN extracts can find a default.""",
    )

    parser.add_argument(
        "-w",
        "--workflow-id",
        type=int,
        help="""The workflow to extract. Required if there is more than one workflow in
            the classifications file. This is only used for nfn formats.""",
    )

    # ---- Text reconciliation tuning -----------------------------------------
    parser.add_argument(
        "--fuzzy-ratio-threshold",
        default=90,
        type=int,
        help="""Sets the cutoff for fuzzy ratio matching (0-100) (default: %(default)s)
            See https://github.com/seatgeek/fuzzywuzzy.""",
    )

    parser.add_argument(
        "--fuzzy-set-threshold",
        default=50,
        type=int,
        help="""Sets the cutoff for fuzzy set matching (0-100) (default: %(default)s).
            See https://github.com/seatgeek/fuzzywuzzy.""",
    )

    parser.add_argument(
        "--join-distance",
        default=6,
        type=int,
        help="""When highlighted text are withing this distance join them into a
            single text (default: %(default)s).""",
    )

    parser.add_argument(
        "--workflow-csv",
        default="",
        metavar="CSV",
        help="""Sometimes we need to translate a value from its numeric code to a
            human-readable string. The workflow file will contain these translations.
            """,
    )

    # ---- Input format and column handling -----------------------------------
    parser.add_argument(
        "-f",
        "--format",
        choices=["nfn", "csv", "json"],
        default="nfn",
        help="""The unreconciled data is in what type of file? nfn=A Zooniverse
            classification data dump. csv=A flat CSV file. json=A JSON file. When the
            format is "csv" or "json" we require the --column-types. If the type is
            "nfn" we can guess the --column-types but the --column-types option will
            still override our guesses. (default: %(default)s)""",
    )

    parser.add_argument(
        "-c",
        "--column-types",
        action="append",
        help="""We need do identify what the column types are for CSV or JSON files.
            This is a string with information on how to reconcile each column in the
            input file. The format is --column-types "foo:select,bar:text,baz:text".
            The list is comma separated with the column label going before the colon
            and the reconciliation type after the colon. You may want to use this
            argument multiple times. The default field type is a NoOp (Do nothing).""",
    )

    parser.add_argument(
        "--group-by",
        default="subject_id",
        help="""Group CSV & JSON the rows by this column (Default=subject_id).""",
    )

    # ---- Summary report -----------------------------------------------------
    parser.add_argument(
        "--page-size",
        default=20,
        type=int,
        help="""Page size for the summary report's detail section.
            (default: %(default)s)""",
    )

    parser.add_argument(
        "--no-summary-detail",
        action="store_true",
        help="""Skip the Reconciliation Detail section in the summary report.""",
    )

    parser.add_argument(
        "-V", "--version", action="version", version=f"%(prog)s {VERSION}"
    )

    args = parser.parse_args()

    # Fixed settings that have no command-line option.
    args.row_key = "classification_id"
    args.user_column = "user_name"
    args.max_transcriptions = 50

    if not 0 <= args.fuzzy_ratio_threshold <= 100:
        utils.error_exit("--fuzzy-ratio-threshold must be between 0 and 100.")

    if not 0 <= args.fuzzy_set_threshold <= 100:
        utils.error_exit("--fuzzy-set-threshold must be between 0 and 100.")

    # This check must see the raw choice ("nfn"). It used to run after the
    # "_format" suffix was appended, so the comparison could never be true and
    # the warning was dead code.
    # NOTE(review): the --format help text says --column-types overrides the
    # nfn guesses, while this warning says they are ignored -- confirm which
    # of the two statements is the intended behavior.
    if args.format == "nfn" and args.column_types:
        warnings.warn("Column types are ignored for 'nfn' format.")

    # The suffixed name is the key used to select the reader plugin.
    args.format = f"{args.format}_format"

    return args
def zip_files(args):
    """Bundle the generated output files into a zip archive, then delete them.

    The files named by ``args.unreconciled``, ``args.reconciled`` and
    ``args.summary`` (whichever are set) are stored in the archive under their
    base names using DEFLATE compression. The originals are removed only after
    the archive has been written and closed.

    Args:
        args: The parsed command-line namespace. The archive path is taken
            from ``args.zip``; an optional ``args.zip_keep`` attribute is used
            as a fallback when ``args.zip`` is empty.

    Raises:
        ValueError: If no archive path is available.
    """
    # The parser in this file defines no ``zip_keep`` option, so read it
    # defensively instead of failing with an AttributeError.
    zip_file = args.zip or getattr(args, "zip_keep", None)
    if not zip_file:
        raise ValueError("No zip archive path was given (expected args.zip).")

    args_dict = vars(args)
    output_paths = [
        args_dict[name]
        for name in ("unreconciled", "reconciled", "summary")
        if args_dict[name]
    ]

    with zipfile.ZipFile(zip_file, mode="w") as zippy:
        for path in output_paths:
            zippy.write(
                path,
                arcname=basename(path),
                compress_type=zipfile.ZIP_DEFLATED,
            )

    # Delete the originals only once the archive is safely closed.
    for path in output_paths:
        os.remove(path)
def main():
    """Run the command-line workflow from argument parsing to optional zipping.

    Reads the unreconciled table with the reader plugin selected by
    ``args.format``, writes whichever outputs were requested (unreconciled CSV,
    reconciled CSV, summary report), and finally zips them when ``--zip`` was
    given.
    """
    args = parse_args()

    readers = utils.get_plugins("formats")
    unreconciled: Table = readers[args.format].read(args)

    if not len(unreconciled):
        utils.error_exit(f"Workflow {args.workflow_id} has no data.")

    if args.unreconciled:
        unreconciled.to_csv(args, args.unreconciled)

    # Reconciliation is only performed when one of its consumers was requested.
    needs_reconciling = args.reconciled or args.summary
    if needs_reconciling:
        reconciled = unreconciled.reconcile(args)

        if args.reconciled:
            reconciled.to_csv(args, args.reconciled, args.explanations)

        if args.summary:
            summary.report(args, unreconciled, reconciled)

    if args.zip:
        zip_files(args)
# Run the command-line workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()