dataloader.py

import re
import string

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer


class DataLoader:
    def __init__(self, tokenize):
        self.seed = 42
        self.tokenize = tokenize
        print('dataset initializing start')

    # Split a document into sentences (one sentence per line)
    def to_sentence(self, doc):
        return doc.strip().split('\n')

    # Return the shortest and longest sentence length, counted in whitespace-separated words
    def max_min_length(self, sentences):
        lengths = [len(s.split()) for s in sentences]
        return min(lengths), max(lengths)

    # Load a single file (str) or concatenate several files (list of paths) into one string
    def load_doc(self, filename):
        if isinstance(filename, str):
            with open(filename, mode='rt', encoding='utf-8') as file:
                text = file.read()
        elif isinstance(filename, list):
            text = ""
            for f in filename:
                with open(f, mode='rt', encoding='utf-8') as file:
                    text += file.read()
        else:
            raise TypeError("filename must be a path or a list of paths")
        return text

    def clean_lines(self, lines):
        cleaned = list()
        # prepare regex for char filtering
        re_print = re.compile('[^%s]' % re.escape(string.printable))
        # prepare translation table for removing punctuation
        table = str.maketrans('', '', string.punctuation)
        # map German umlauts and eszett to ASCII equivalents
        special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}
        for line in lines:
            # convert the special chars
            line = line.translate(special_char_map)
            # tokenize on white space
            line = line.split()
            # convert to lower case
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars from each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            cleaned.append(' '.join(line))
        return cleaned

    def make_dataset(self, data, train_size=0.8):
        # Strip HTML line breaks and lower-case the text column
        data.iloc[:, 0] = data.iloc[:, 0].apply(lambda x: x.replace("<br />", " "))
        data.iloc[:, 0] = data.iloc[:, 0].apply(lambda x: x.lower())
        # First filter: drop documents with 256 or more whitespace-separated words
        data["len"] = data.iloc[:, 0].apply(lambda x: len(x.split()))
        data = data[data["len"] < 256].copy()
        print("Length of data after first step of preprocessing: ", len(data))
        print("Tokenizing the data...")
        # Second filter: drop documents with 256 or more tokens after tokenization
        data["len"] = data.iloc[:, 0].apply(lambda x: len(self.tokenize(x)))
        data = data[data["len"] < 256].copy()
        print("Length of the data: ", len(data))
        # Split into train / validation / test (e.g. 80 / 10 / 10 for train_size=0.8)
        train, rem = train_test_split(data, train_size=train_size, random_state=self.seed)
        valid, test = train_test_split(rem, train_size=0.5, random_state=self.seed)
        # Work on copies so the assignments below do not trigger SettingWithCopy warnings
        train, valid, test = train.copy(), valid.copy(), test.copy()
        # Keep only letters, then tokenize each document into a list of tokens
        train.iloc[:, 0] = train.iloc[:, 0].apply(lambda row: re.sub("[^A-Za-z]+", " ", row)).apply(self.tokenize)
        valid.iloc[:, 0] = valid.iloc[:, 0].apply(lambda row: re.sub("[^A-Za-z]+", " ", row)).apply(self.tokenize)
        test.iloc[:, 0] = test.iloc[:, 0].apply(lambda row: re.sub("[^A-Za-z]+", " ", row)).apply(self.tokenize)
        return train, valid, test

    # Build a word-to-index vocabulary from the tokenized training corpus
    def get_vocab(self, training_corpus):
        vocab = {'__PAD__': 0, '__SOS__': 1, '__UNK__': 2}
        for item in training_corpus:
            for word in item:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    # Map a tokenized document to vocabulary indices, then pad (or truncate) to max_len
    def encode_and_pad(self, tokens, vocab, max_len=256):
        unk_ID = vocab["__UNK__"]
        pad_ID = vocab["__PAD__"]
        ids = [vocab.get(token, unk_ID) for token in tokens]
        ids = ids[:max_len]
        ids += [pad_ID] * (max_len - len(ids))
        return ids

    def make_iter(self, train, validate, test, batch_size, device, vocab=None):
        # Build the vocabulary from the training split if none was supplied
        if vocab is None:
            vocab = self.get_vocab(train.iloc[:, 0])
        train_y = torch.tensor(train.iloc[:, 1].values.astype(np.float32), device=device)
        valid_y = torch.tensor(validate.iloc[:, 1].values.astype(np.float32), device=device)
        test_y = torch.tensor(test.iloc[:, 1].values.astype(np.float32), device=device)
        # The fixed length of 256 matches the filtering in make_dataset
        train_x = torch.tensor([self.encode_and_pad(doc, vocab) for doc in tqdm(train.iloc[:, 0])], device=device)
        valid_x = torch.tensor([self.encode_and_pad(doc, vocab) for doc in tqdm(validate.iloc[:, 0])], device=device)
        test_x = torch.tensor([self.encode_and_pad(doc, vocab) for doc in tqdm(test.iloc[:, 0])], device=device)
        train = torch.utils.data.TensorDataset(train_x, train_y)
        validate = torch.utils.data.TensorDataset(valid_x, valid_y)
        test = torch.utils.data.TensorDataset(test_x, test_y)
        train_iterator = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
        valid_iterator = torch.utils.data.DataLoader(dataset=validate, batch_size=batch_size, shuffle=True)
        test_iterator = torch.utils.data.DataLoader(dataset=test, batch_size=batch_size, shuffle=True)
        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator


class Tokenizer:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def tokenize(self, text):
        """
        Tokenize an English string into a list of BERT WordPiece tokens,
        including the special [CLS] and [SEP] tokens added by encode().
        """
        return self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(text))
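

# A minimal usage sketch, kept out of the library code behind the __main__ guard.
# The CSV path ("reviews.csv") and its column layout (raw text in column 0, a
# numeric label in column 1) are assumptions for illustration only, not part of
# the module above.
if __name__ == "__main__":
    tokenizer = Tokenizer()
    loader = DataLoader(tokenize=tokenizer.tokenize)

    # Hypothetical input file: first column raw text, second column a 0/1 label.
    df = pd.read_csv("reviews.csv")

    train, valid, test = loader.make_dataset(df, train_size=0.8)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_iter, valid_iter, test_iter = loader.make_iter(
        train, valid, test, batch_size=32, device=device
    )

    # Each batch is a (token_ids, labels) pair with shapes (batch_size, 256) and (batch_size,)
    for token_ids, labels in train_iter:
        print(token_ids.shape, labels.shape)
        break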