-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_dataloader.py
54 lines (42 loc) · 1.71 KB
/
get_dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import json
import pandas as pd
from pathlib import Path
import os
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
class CodeSearchNetDataset(Dataset):
    """Dataset of code/docstring pairs that embeds both texts on access.

    Every item is a dict with the raw ``code`` and ``docstring`` values of one
    row, plus the result of encoding each of them with a
    ``bert-base-nli-mean-tokens`` SentenceTransformer model.
    """

    def __init__(self, dataframe, device=None):
        """
        :parameter
            dataframe: Table providing the ``code`` and ``docstring`` columns
            device (str, optional): Device the embedding model is moved to.
                When omitted, ``'cuda'`` is used if it is available and
                ``'cpu'`` otherwise.
        """
        if device is None:
            # Fall back to the CPU so the dataset can also be built on hosts
            # without a usable GPU instead of failing in ``.to('cuda')``.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.dataframe = dataframe
        self.model = SentenceTransformer('bert-base-nli-mean-tokens').to(device)

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        :parameter
            idx: Positional row index, passed to ``DataFrame.iloc``
        :return:
            Dict with keys ``code``, ``docstring``, ``code_emb`` and
            ``docstring_emb``; the ``*_emb`` values are whatever
            ``self.model.encode`` returns for the corresponding text
        """
        # Fetch the row once; every field below is derived from it.
        row = self.dataframe.iloc[idx]
        code = row['code']
        docstring = row['docstring']
        return {'code': code,
                'docstring': docstring,
                'code_emb': self.model.encode(code),
                'docstring_emb': self.model.encode(docstring)}
def jsonl_list_to_dataframe(files):
    """Read every gzip-compressed JSON-lines file in *files* and stack them.

    :parameter
        files: Iterable of paths to ``jsonl.gz`` files
    :return:
        One pandas DataFrame holding the rows of all files, in the order the
        files were given; each file's own row index is kept as-is
    """
    # Options shared by every read: one JSON record per line, gzip on disk.
    read_options = dict(orient='records', compression='gzip', lines=True)
    frames = [pd.read_json(path, **read_options) for path in files]
    return pd.concat(frames, sort=False)
def get_dataset(data_type):
    """
    Load the CSV of one data split and wrap it in a CodeSearchNetDataset.

    :parameter
        data_type (str): Either of train, valid and test; selects the file
            ``generated_resources/<data_type>_data.csv``
    :return: Dataset for given type
    """
    csv_path = f'generated_resources/{data_type}_data.csv'
    frame = pd.read_csv(csv_path)
    return CodeSearchNetDataset(frame)
def get_dataloaders(data_type, batch_size, shuffle):
    """
    Build a DataLoader over the dataset of the requested split.

    :parameter
        data_type (str): Either of train, valid and test
        batch_size (int): Batch size for the dataloader
        shuffle (bool): Whether you want to shuffle the entries or not
    :return:
        Dataloader for given type
    """
    # Honour the caller's choice: a hard-coded ``shuffle=True`` would also
    # reorder validation/test data regardless of what was requested.
    return DataLoader(get_dataset(data_type), batch_size=batch_size, shuffle=shuffle)