sentence_prep.py

""" Simple data cleanup prior to tokenization. """
import config
import pandas as pd

def ignore(sentence):
    """ Return True for sentences containing very infrequent characters. """
    for c in sentence:
        if c in config.IGNORE:
            return True
    return False
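
# Example (hypothetical IGNORE set): with config.IGNORE = {"\u2122"},
# ignore("mark\u2122") returns True, so that sentence is filtered out.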

def replace_chars(sentence):
    """ Remove problematic whitespace characters from a sentence. """
    char_replace = {
        "\xa0": "",    # non-breaking space
        "\u200b": "",  # zero-width space
    }
    for k, v in char_replace.items():
        sentence = sentence.replace(k, v)
    return sentence
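
# Example: replace_chars("12\xa0345\u200b") returns "12345"; the characters
# are removed outright rather than replaced with a regular space.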

def initial_cleanup(df):
    df = df.dropna()  # remove the single row containing a null entry
    # Drop rows where any cell contains a character flagged in config.IGNORE
    # (note: DataFrame.applymap is deprecated in favor of DataFrame.map in pandas >= 2.1).
    data = df[~df.applymap(ignore).any(axis=1)]
    # Drop rows containing an ellipsis, then normalize the remaining text.
    data = data[~data.applymap(lambda x: "..." in x).any(axis=1)]
    data = data.applymap(replace_chars)
    data = data.applymap(lambda x: x.strip())
    # Add begin/end-of-sentence tokens for tokenization.
    data = data.apply(lambda x: config.SPECIAL_TOKENS["BOS_TOKEN"] + x + config.SPECIAL_TOKENS["EOS_TOKEN"])
    return data.reset_index(drop=True)
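

# Minimal usage sketch, assuming the raw data is a two-column CSV of
# parallel sentences; the file name and column names are hypothetical.
if __name__ == "__main__":
    raw = pd.read_csv("sentences.csv", names=["source", "target"], header=None)
    clean = initial_cleanup(raw)
    print(clean.head())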