# synthetic_text_with_files_example.py
# Forked from nlx-group/overlapy.
from os.path import join
from overlapy import OverlapyTestSet, Overlapy
# This example uses the same pretraining dataset and testset
# as synthetic_text_example.py, but stored in files for demonstration purposes
# As such, 3 examples from the testset are expected to be matched
def tokenizer(s: str) -> list:
    """Split *s* into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or
    whitespace-only string yields an empty list.
    """
    # Simple tokenization by whitespace.
    return s.split()
# Load the testset: one example per line, with trailing whitespace
# (including the newline) stripped from each line.
with open(join("files", "testset.txt")) as fr:
    testset_examples = [line.rstrip() for line in fr]

# We'll override the parameter min_n and set it to 1 as we want the ngram value to be allowed
# to be less than 8. The testset examples were constructed for it to be 4, actually.
testset = OverlapyTestSet(
    "test", min_n=1, examples=[tokenizer(s) for s in testset_examples]
)
print(f"N value: {testset.compute_n()}")
# Each n-gram is converted to a tuple before being de-duplicated in a set,
# so the printed number is the count of distinct n-grams.
print(f"# NGrams: {len(set(map(tuple, testset.ngrams())))}")
class SyntheticPreTrainingDataset:
    """Toy pretraining dataset whose examples are stored in files on disk.

    Example ``idx`` is read from ``files/pretraining_dataset.<idx>.txt`` and
    returned as a list of whitespace-separated tokens. Only ``__getitem__``
    and ``__len__`` are implemented.
    """

    def __getitem__(self, idx):
        """Read the file for example ``idx`` and return its tokens."""
        # Our pretraining dataset is split into 5 files.
        # For large datasets, this is likely to happen.
        # Here we offer a simple example where each file has just one sentence
        # But the directory structure and file structure can be arbitrarily complex
        with open(join("files", f"pretraining_dataset.{idx}.txt")) as fr:
            return tokenizer(fr.read().rstrip())

    def __len__(self):
        """Return the number of examples (files) in the dataset."""
        # We could list the directory,
        # but we know it's 5 files, so why not just say so explicitly.
        return 5
# We create an Overlapy object, handing three arguments:
# * Testsets: A list of OverlapyTestSet objects that we want to study.
# * Dataset: Dataset we want to calculate collisions with
# * n_workers: Number of worker processes to use
#
# The run is kept under a __main__ guard because worker processes are used:
# with the "spawn" start method (the default on Windows and macOS) each worker
# re-imports this module, and unguarded code would try to start new workers
# from inside a worker.
if __name__ == "__main__":
    overlapy = Overlapy(
        testsets=[testset],
        dataset=SyntheticPreTrainingDataset(),
        n_workers=2,
    )
    # Let's run and get the matches
    matches = overlapy.run()
    # We should be getting 3 testset examples that have been flagged for matches.
    # #0 matches on A B A C
    # #1 matches on F J K H
    # #3 matches on T V Z E
    # As we had noted above
    print(f"Matches: {list(testset.get_matches(matches))}")