preprocess_data.py
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from pathlib import Path


def tokenize_code(text):
    """Gets filtered function tokens."""
    # Remove decorators and everything up to the 'def ' token
    keyword = 'def '
    before_keyword, keyword, after_keyword = text.partition(keyword)
    words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(after_keyword)
    # Convert function tokens to lowercase and drop single-letter variables
    new_words = [word.lower() for word in words if (word.isalpha() and len(word) > 1) or word.isnumeric()]
    return new_words
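
# Illustrative example (not part of the original script):
#   tokenize_code('def add_numbers(a, b):\n    return a + b')
# returns ['add', 'numbers', 'return'] -- the single-letter variables 'a' and 'b'
# are filtered out and all tokens are lowercased.
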
def tokenize_docstring(text):
    """Gets filtered docstring tokens which help describe the function."""
    # Strip parameter annotations (':', '@param', 'param', '@brief') so that only
    # the descriptive part of the docstring remains
    before_keyword, keyword, after_keyword = text.partition(':')
    before_keyword, keyword, after_keyword = before_keyword.partition('@param')
    before_keyword, keyword, after_keyword = before_keyword.partition('param')
    before_keyword, keyword, after_keyword = before_keyword.partition('@brief')
    if after_keyword:
        # A '@brief' tag was found: keep only the description that follows it
        words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(after_keyword)
    else:
        # Otherwise keep only the text before the first '@' tag
        before_keyword, keyword, after_keyword = before_keyword.partition('@')
        words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(before_keyword)
    # Convert all docstring tokens to lowercase
    new_words = [word.lower() for word in words if word.isalnum()]
    return new_words
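
# Illustrative example (not part of the original script):
#   tokenize_docstring('Compute the sum of two numbers.\n:param a: first number')
# keeps only the description before the first ':' and returns
# ['compute', 'the', 'sum', 'of', 'two', 'numbers'].
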
def jsonl_list_to_dataframe(files):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f,
                                   orient='records',
                                   compression='gzip',
                                   lines=True)
                      for f in files], sort=False)
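
# Note (illustrative, not part of the original script): lines=True makes pandas read each
# file as JSON Lines (one JSON object per line), compression='gzip' lets it open the
# .jsonl.gz files directly, and sort=False avoids re-sorting columns when concatenating.
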
def preprocess_data(data_type):
    """Builds the DataFrame for one split ('train', 'valid' or 'test') with filtered token columns."""
    data_frame = jsonl_list_to_dataframe(
        sorted(Path('resources/data/python/final/jsonl/' + data_type + '/').glob('**/*.gz')))
    data_frame['docstring_filtered'] = data_frame['docstring'].map(tokenize_docstring)
    data_frame['code_filtered'] = data_frame['code'].map(tokenize_code)
    # Join the token lists back into space-separated strings
    data_frame['docstring_filtered'] = [' '.join(map(str, tokens)) for tokens in data_frame['docstring_filtered']]
    data_frame['code_filtered'] = [' '.join(map(str, tokens)) for tokens in data_frame['code_filtered']]
    return data_frame
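
# Illustrative usage (not part of the original script): the returned DataFrame keeps the
# original columns from the jsonl.gz records (e.g. 'code' and 'docstring') plus the two
# derived columns, e.g.
#   preprocess_data('valid')[['code_filtered', 'docstring_filtered']].head()
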
if __name__ == '__main__':
    train_dataframe = preprocess_data('train')
    valid_dataframe = preprocess_data('valid')
    test_dataframe = preprocess_data('test')
    # Make sure the output directory exists before writing the CSVs
    Path('generated_resources').mkdir(parents=True, exist_ok=True)
    train_dataframe.to_csv('generated_resources/train_data.csv')
    valid_dataframe.to_csv('generated_resources/valid_data.csv')
    test_dataframe.to_csv('generated_resources/test_data.csv')