16 changes: 8 additions & 8 deletions octis/dataset/dataset.py
@@ -119,7 +119,7 @@ def _save_metadata(self, file_name):
         else:
             raise Exception("error in saving metadata")
 
-    def _load_metadata(self, file_name):
+    def _load_metadata(self, file_name, encoding='utf-8'):
         """
         Loads metadata from json serialized format
         Parameters
@@ -128,7 +128,7 @@ def _load_metadata(self, file_name):
         """
         file = Path(file_name)
         if file.is_file():
-            with open(file_name, 'r') as metadata_file:
+            with open(file_name, 'r', encoding=encoding) as metadata_file:
                 metadata = json.load(metadata_file)
                 self.__metadata = metadata
 
@@ -234,7 +234,7 @@ def _save_document_indexes(self, file_name):
         for i in self.__original_indexes:
             outfile.write(str(i) + "\n")
 
-    def _load_vocabulary(self, file_name):
+    def _load_vocabulary(self, file_name, encoding='utf-8'):
         """
         Loads vocabulary from a file
         Parameters
@@ -244,7 +244,7 @@ def _load_vocabulary(self, file_name):
         vocabulary = []
         file = Path(file_name)
         if file.is_file():
-            with open(file_name, 'r') as vocabulary_file:
+            with open(file_name, 'r', encoding=encoding) as vocabulary_file:
                 for line in vocabulary_file:
                     vocabulary.append(line.strip())
         self.__vocabulary = vocabulary
@@ -311,7 +311,7 @@ def save(self, path, multilabel=False):
         except:
             raise Exception("error in saving the dataset")
 
-    def load_custom_dataset_from_folder(self, path, multilabel=False):
+    def load_custom_dataset_from_folder(self, path, multilabel=False, encoding='utf-8'):
         """
         Loads all the dataset from a folder
         Parameters
@@ -321,11 +321,11 @@ def load_custom_dataset_from_folder(self, path, multilabel=False):
         self.dataset_path = path
         try:
             if exists(self.dataset_path + "/metadata.json"):
-                self._load_metadata(self.dataset_path + "/metadata.json")
+                self._load_metadata(self.dataset_path + "/metadata.json", encoding=encoding)
             else:
                 self.__metadata = dict()
             df = pd.read_csv(
-                self.dataset_path + "/corpus.tsv", sep='\t', header=None)
+                self.dataset_path + "/corpus.tsv", sep='\t', header=None, encoding=encoding)
             if len(df.keys()) > 1:
                 # just make sure docs are sorted in the right way (train - val - test)
                 final_df = pd.concat(
@@ -351,7 +351,7 @@ def load_custom_dataset_from_folder(self, path, multilabel=False):
                 self.__metadata['last-training-doc'] = len(df[0])
 
             if exists(self.dataset_path + "/vocabulary.txt"):
-                self._load_vocabulary(self.dataset_path + "/vocabulary.txt")
+                self._load_vocabulary(self.dataset_path + "/vocabulary.txt", encoding=encoding)
             else:
                 vocab = set()
                 for d in self.__corpus:
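Net effect of the patch: every file read on the custom-dataset path (metadata.json, corpus.tsv, vocabulary.txt) now honors a caller-supplied encoding, defaulting to UTF-8. A minimal usage sketch follows, assuming the Dataset class exported by this module; the folder path and the Latin-1 encoding are hypothetical examples, not part of the change:

    from octis.dataset.dataset import Dataset

    dataset = Dataset()
    # Dataset files written in Latin-1 rather than UTF-8
    dataset.load_custom_dataset_from_folder(
        "path/to/my_dataset",  # hypothetical folder containing corpus.tsv etc.
        encoding="latin-1",    # forwarded to _load_metadata, pd.read_csv, and _load_vocabulary
    )

Defaulting to encoding='utf-8' also makes behavior consistent across platforms: the previous open(file_name, 'r') calls fell back to the platform locale encoding (e.g. cp1252 on Windows), so a dataset saved on one machine could fail to load, or load garbled, on another.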