import os
import pickle
import random
import time

import torch
import numpy as np
import pandas as pd
from model_functions import model_factory
from sentence_optimization import (
optimize_sentence_set,
initialize_random_word_sentence,
controversiality_loss_func,
)
from utils import exclusive_write_line
from task_scheduler import TaskScheduler
class NaturalSentenceAssigner:
"""Assign natural sentences as initial sentences. Each natural sentence is assigned for one model pair."""
def __init__(self, model_pairs, seed=42, natural_sentence_file=None):
"""Initialize the assigner.
Args:
model_pairs: a list of model pair tuples
seed: random seed
natural_sentence_file: a file containing natural sentences. If None, use the default file.
"""
if natural_sentence_file is None:
natural_sentence_file = os.path.join(
"resources",
"sentence_corpora",
"natural_sentences_for_synthetic_controversial_sentence_pair_optimization.txt",
)
with open(natural_sentence_file) as f:
natural_sentences = [l.strip().rstrip(".") for l in f]
natural_sentences = pd.DataFrame({"sentence": natural_sentences})
natural_sentences = natural_sentences.sample(
frac=1, random_state=seed
) # shuffle sentences
        unique_model_pairs = sorted({tuple(sorted(pair)) for pair in model_pairs})
random.Random(seed).shuffle(unique_model_pairs)
n_unique_model_pairs = len(unique_model_pairs)
sentence_groups = natural_sentences.groupby(
np.arange(len(natural_sentences)) % n_unique_model_pairs
)
self.unique_model_pairs = unique_model_pairs
self.model_pair_dict = {
tuple(model_pair): sentence_group[1].sort_index()
for model_pair, sentence_group in zip(unique_model_pairs, sentence_groups)
}
    def get_sentences(self, model_pair):
        """Return the DataFrame of natural sentences assigned to the given (unordered) model pair."""
        return self.model_pair_dict[tuple(sorted(model_pair))]
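

# Illustrative usage of NaturalSentenceAssigner (a sketch; the model names below are just
# examples taken from all_model_names defined at the bottom of this script):
#
#     assigner = NaturalSentenceAssigner([("gpt2", "bert"), ("bert", "gpt2")])
#     sentence_df = assigner.get_sentences(("gpt2", "bert"))
#     # sentence_df is a pandas DataFrame with a "sentence" column, holding the natural
#     # sentences reserved for this (unordered) model pair.
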
def synthesize_controversial_sentence_pair_set(
model_pairs,
initial_sentence_assigner,
results_csv_folder=None,
sent_len=8,
allow_only_prepositions_to_repeat=True,
max_sentence_pairs_per_run=5,
natural_initialization=True,
replacement_strategy="cyclic",
n_pairs_to_synthesize_per_model_pair=100,
max_non_decreasing_loss_attempts_per_word=5,
max_replacement_attempts_per_word=50,
max_opt_hours=None,
verbose=3,
):
"""Synthesize a set of controversial synthetic sentence pairs.
This function can be run in parallel by multiple nodes to build a large set of sentence pairs.
args:
model_pairs: list of tuples of model names
initial_sentence_assigner: NaturalSentenceAssigner object
results_csv_folder: string, path to folder where the resulting sentence pairs will be saved
sent_len: int, length of synthetic sentences (number of words)
allow_only_prepositions_to_repeat: bool, if True, only prepositions can be repeated in the sentence
max_pairs: int, maximum number of sentence pairs to synthesize with each run of the script (set to None to keep running). Useful if HPC jobs are time-limited.
natural_initialization: bool, if True, use natural sentences as initial sentences. Otherwise, initialize as random sentences.
n_pairs_to_synthesize_per_model_pair: int, number of sentence pairs to synthesize for each model pair.
max_opt_hours: int, maximum number of hours to run the optimization for each sentence pair.
verbose: int, verbosity level.
Generates a CSV file with the following columns:
natural sentence, synthetic sentence, loss, log p(natural sentence | model1), log p(synthetic sentence | model1), log p(natural sentence | model2), log p(synthetic sentence | model2)
# model1 and model2 are the two models that are used to generate the sentence pair, as determined from the filename {model1}_vs_{model2}.
"""
n_sentences = 2 # we optimize a pair of sentences
sentences_to_change = [1] # change the second sentence, keep the first fixed
sch = TaskScheduler(
max_job_time_in_seconds=3600 * 6
) # tracking multiple sentence optimization jobs
job_df = (
sch.to_pandas()
) # get a dataframe of jobs, this DataFrame will be non-empty if there are jobs running
try:
job_id_df = pd.DataFrame(list(job_df["job_id"]))
    except Exception:  # no jobs recorded yet, or the job table lacks a job_id column
job_id_df = None
print("job_id_df", job_id_df)
    # prioritize model pairs with the fewest completed or running jobs
model_pairs_stats = []
for model_name_pair in model_pairs:
[model1_name, model2_name] = model_name_pair
if model1_name == model2_name:
continue
if job_id_df is not None and len(job_id_df) > 0:
n_jobs = (
(job_id_df["model_1"] == model1_name)
& (job_id_df["model_2"] == model2_name)
).sum()
else:
n_jobs = 0
model_pairs_stats.append(
{
"model_1": model1_name,
"model_2": model2_name,
"n_jobs": n_jobs,
"tie_breaker": random.random(),
}
)
model_pairs_stats = pd.DataFrame(model_pairs_stats)
model_pairs_stats = model_pairs_stats.sort_values(
by=["n_jobs", "tie_breaker"], ascending=True
)
    # list model pairs, those with fewer completed/running jobs first
model_pair_list = list(
zip(model_pairs_stats["model_1"], model_pairs_stats["model_2"])
)
if allow_only_prepositions_to_repeat: # load a list of prepositions
allowed_repeating_words = set(
pickle.load(open(os.path.join("resources", "preps.pkl"), "rb"))
)
keep_words_unique = True
else:
allowed_repeating_words = None
keep_words_unique = False
for model_name_pair in model_pair_list:
[model1_name, model2_name] = model_name_pair
if results_csv_folder is not None:
results_csv_fname = os.path.join(
results_csv_folder, model1_name + "_vs_" + model2_name + ".csv"
)
# allocate GPUs. Ideally, we'd like a separate GPU for each model.
model_GPU_IDs = []
cur_GPU_ID = 0
for model_name in model_name_pair:
model_GPU_IDs.append(cur_GPU_ID)
            if model_name not in [
                "bigram",
                "trigram",
            ]:  # bigram and trigram models run on the CPU, so gpu_id will be ignored
cur_GPU_ID += 1
if cur_GPU_ID >= torch.cuda.device_count():
cur_GPU_ID = 0
models_loaded = False
n_optimized = 0
natural_sentence_df = initial_sentence_assigner.get_sentences(model_name_pair)
for i_natural_sentence, (sentence_index, natural_sentence) in enumerate(
zip(natural_sentence_df.index, natural_sentence_df["sentence"])
):
if i_natural_sentence >= (n_pairs_to_synthesize_per_model_pair + 1):
break
job_id = {
"natural_sentence": natural_sentence,
"model_1": model1_name,
"model_2": model2_name,
}
success = sch.start_job(
job_id
) # tracking the optimization job (useful for HPC environments)
if not success:
continue
print(
"optimizing sentence {} ({}) for {} vs {}".format(
i_natural_sentence, sentence_index, model1_name, model2_name
)
)
if not models_loaded: # load models
models = []
for model_name, model_GPU_ID in zip(model_name_pair, model_GPU_IDs):
device_type = (
"cpu" if model_name in ["bigram", "trigram"] else "gpu"
)
print(
"loading "
+ model_name
+ " "
+ device_type
+ " "
+ str(model_GPU_ID)
+ "...",
end="",
)
models.append(model_factory(model_name, model_GPU_ID))
print("done.")
models_loaded = True
loss_func = controversiality_loss_func
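            # Note: sentences_log_p appears to be indexed as [..., model, sentence]
            # (model 0 = model1_name, model 1 = model2_name; sentence 0 = the fixed natural
            # sentence, sentence 1 = the optimized synthetic sentence), which is how
            # monitoring_func below reads it.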
def monitoring_func(sentences, sentences_log_p):
"""prints an update on optimization status"""
print(
model1_name
+ ":"
+ "{:.2f}/{:.2f}".format(
sentences_log_p[..., 0, 0], sentences_log_p[..., 0, 1]
)
)
print(
model2_name
+ ":"
+ "{:.2f}/{:.2f}".format(
sentences_log_p[..., 1, 0], sentences_log_p[..., 1, 1]
)
)
if max_opt_hours is not None:
                # stop optimization after max_opt_hours hours
start_time = time.time()
def stop_if_time_exceeded(loss):
time_elapsed_in_hours = (time.time() - start_time) / 3600
if time_elapsed_in_hours > max_opt_hours:
print(
f"time exceeded ({time_elapsed_in_hours:.2f} hours), stopping optimization"
)
return True
else:
return False
internal_stopping_condition = stop_if_time_exceeded
else:
                internal_stopping_condition = (
                    lambda loss: False
                )  # never stop the optimization early
if natural_initialization:
initial_sentences = [natural_sentence] * n_sentences
else:
initial_sentences = [
initialize_random_word_sentence(
sent_len, initial_sampling="uniform"
)
] * n_sentences
results = optimize_sentence_set(
n_sentences,
models=models,
loss_func=loss_func,
sentences=initial_sentences,
sent_length_in_words=sent_len,
initial_sampling=None,
replacement_strategy=replacement_strategy,
monitoring_func=monitoring_func,
internal_stopping_condition=internal_stopping_condition,
start_with_identical_sentences=True,
max_steps=10000,
keep_words_unique=keep_words_unique,
allowed_repeating_words=allowed_repeating_words,
sentences_to_change=sentences_to_change,
max_replacement_attempts_per_word=max_replacement_attempts_per_word,
max_non_decreasing_loss_attempts_per_word=max_non_decreasing_loss_attempts_per_word,
verbose=verbose,
)
if results is False: # optimization was terminated
continue
sentences = results["sentences"]
sentences_log_p = results["sentences_log_p"]
print(sentences)
monitoring_func(sentences, sentences_log_p)
            # save results.
            # CSV row format:
            # sentence index, sentence 1 (natural), sentence 2 (synthetic), loss, model_1_log_prob_sent1, model_1_log_prob_sent2, model_2_log_prob_sent1, model_2_log_prob_sent2
outputs = (
[sentence_index]
+ results["sentences"]
+ [results["loss"]]
+ list(sentences_log_p.flat)
)
line = ",".join(map(str, outputs))
exclusive_write_line(results_csv_fname, line)
sch.job_done(job_id, results=results)
n_optimized += 1
if (max_sentence_pairs_per_run is not None) and (
n_optimized >= max_sentence_pairs_per_run
):
break
if __name__ == "__main__":
all_model_names = [
"bigram",
"trigram",
"rnn",
"lstm",
"gpt2",
"bert",
"roberta",
"xlm",
"electra",
]
# get all pairs excluding self-pairs:
model_pairs = []
for model1_name in all_model_names:
for model2_name in all_model_names:
if model1_name != model2_name:
model_pairs.append((model1_name, model2_name))
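    # Note: model_pairs contains both orderings of every pair (e.g., ("gpt2", "bert") and
    # ("bert", "gpt2")). NaturalSentenceAssigner collapses them to unordered pairs when
    # assigning initial sentences, while each ordered pair still gets its own results CSV
    # ({model1}_vs_{model2}.csv).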
initial_sentence_assigner = NaturalSentenceAssigner(model_pairs)
sent_len = 8
results_csv_folder = os.path.join(
"synthesized_sentences_test",
"controverisal_sentence_pairs_natural_initialization",
"{}_word".format(sent_len),
)
synthesize_controversial_sentence_pair_set(
model_pairs,
initial_sentence_assigner,
results_csv_folder=results_csv_folder,
        sent_len=sent_len,  # in the preprint, we used 8-word sentences
        allow_only_prepositions_to_repeat=True,  # in the preprint, this was True
        natural_initialization=True,  # initialize the optimization from natural sentences
        max_sentence_pairs_per_run=5,  # set this to a small number (e.g., 5) if HPC job time is limited, or to None to keep the code running until it's done
verbose=3,
)