-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathutils.py
More file actions
431 lines (367 loc) · 13.7 KB
/
Copy pathutils.py
File metadata and controls
431 lines (367 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
"""
The utils module contains general functionality that can be used at various places in the application
"""
# imports
from typing import Any, Dict, List, Tuple
import os
import sys
import time
import datetime as dt
import numpy as np
from loguru import logger
from langdetect import detect, LangDetectException
import psutil
import keyboard
# local imports
import settings
# file extensions (including the leading dot) that are accepted as relevant documents
VALID_EXTENSIONS = [".pdf", ".docx", ".html", ".md", ".txt"]
# maps two-letter language codes to language names; languages supported by nltk
LANGUAGE_MAP = {
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "et": "estonian",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "it": "italian",
    "no": "norwegian",
    "pl": "polish",
    "pt": "portuguese",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
}
def create_vectordb_folder(my_folder_path_selected: str) -> None:
    """
    Creates the "vector_stores" subfolder for storage of vector databases if not existing

    Parameters
    ----------
    my_folder_path_selected : str
        the selected document folder path

    Raises
    ------
    FileNotFoundError
        if the selected document folder itself does not exist
    """
    try:
        # create directly instead of listing the folder first: this avoids scanning every
        # entry and closes the gap between the existence check and the creation
        os.mkdir(os.path.join(my_folder_path_selected, "vector_stores"))
    except FileExistsError:
        # an entry with this name is already present, nothing to do
        pass
def create_summaries_folder(my_folder_path_selected: str) -> None:
    """
    Creates the "summaries" subfolder for storage of summaries if not existing

    Parameters
    ----------
    my_folder_path_selected : str
        the selected document folder path

    Raises
    ------
    FileNotFoundError
        if the selected document folder itself does not exist
    """
    try:
        # create directly instead of listing the folder first: this avoids scanning every
        # entry and closes the gap between the existence check and the creation
        os.mkdir(os.path.join(my_folder_path_selected, "summaries"))
    except FileExistsError:
        # an entry with this name is already present, nothing to do
        pass
def create_vectordb_path(content_folder_path: str,
                         retriever_type: str = None,
                         embeddings_provider: str = None,
                         embeddings_model: str = None,
                         text_splitter_method: str = None,
                         chunk_size: int = None,
                         chunk_overlap: int = None,
                         text_splitter_method_child: str = None,
                         chunk_size_child: int = None,
                         chunk_overlap_child: int = None) -> str:
    """
    Builds the full folder path of the vectorstore that belongs to a content folder

    Every optional argument that is left at None is taken from the settings module instead.

    Parameters
    ----------
    content_folder_path : str
        name of the content folder (including the path)
    retriever_type : str, optional
        name of the retriever type, by default None
    embeddings_provider : str, optional
        name of the embeddings provider, by default None
    embeddings_model : str, optional
        name of the embeddings model, by default None
    text_splitter_method : str, optional
        name of the text splitter method, by default None
    chunk_size : int, optional
        the maximum chunk size, by default None
    chunk_overlap : int, optional
        the chunk overlap, by default None
    text_splitter_method_child : str, optional
        name of the text splitter method used for child chunks, by default None
    chunk_size_child : int, optional
        the maximum chunk size of child chunks, by default None
    chunk_overlap_child : int, optional
        the chunk overlap of child chunks, by default None

    Returns
    -------
    str
        vectorstore folder path: <content folder>/vector_stores/<vectordb name>
    """
    # fall back to the configured value for each argument that was not given explicitly
    if retriever_type is None:
        retriever_type = settings.RETRIEVER_TYPE
    if embeddings_provider is None:
        embeddings_provider = settings.EMBEDDINGS_PROVIDER
    if embeddings_model is None:
        embeddings_model = settings.EMBEDDINGS_MODEL
    if text_splitter_method is None:
        text_splitter_method = settings.TEXT_SPLITTER_METHOD
    if chunk_size is None:
        chunk_size = settings.CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = settings.CHUNK_OVERLAP
    if text_splitter_method_child is None:
        text_splitter_method_child = settings.TEXT_SPLITTER_METHOD_CHILD
    if chunk_size_child is None:
        chunk_size_child = settings.CHUNK_SIZE_CHILD
    if chunk_overlap_child is None:
        chunk_overlap_child = settings.CHUNK_OVERLAP_CHILD

    # the vectordb name is made up of the retriever type, the embeddings provider and model,
    # and the parent and child text splitter method, chunk size and chunk overlap
    name_parts = [
        retriever_type,
        embeddings_provider,
        embeddings_model,
        text_splitter_method,
        str(chunk_size),
        str(chunk_overlap),
        text_splitter_method_child,
        str(chunk_size_child),
        str(chunk_overlap_child),
    ]
    return os.path.join(content_folder_path, "vector_stores", "_".join(name_parts))
def is_relevant_file(content_folder_path: str, document_selection: List[str], my_file: str) -> bool:
    """
    Decides whether or not a file is a relevant file

    A file is relevant when it exists as a file in the content folder, has one of the
    VALID_EXTENSIONS, has a name that does not start with "~" and, if a document selection
    other than None or ["All"] is given, is part of that selection.

    Parameters
    ----------
    content_folder_path : str
        name of the content folder (including the path)
    document_selection: List[str]
        list of documents that have been selected; None or ["All"] selects every document
    my_file: str
        name of the file

    Returns
    -------
    bool
        True if file is relevant, otherwise False
    """
    # checks that apply regardless of the document selection
    if not os.path.isfile(os.path.join(content_folder_path, my_file)):
        return False
    if os.path.splitext(my_file)[1] not in VALID_EXTENSIONS:
        return False
    if my_file.startswith("~"):
        return False
    # without an explicit selection every remaining file is relevant
    if document_selection is None or document_selection == ["All"]:
        return True
    return my_file in document_selection
def get_relevant_files_in_folder(content_folder_path: str, document_selection: List[str] = None) -> List[str]:
    """
    Collects the relevant files that are present in a given content folder

    Parameters
    ----------
    content_folder_path : str
        name of the content folder (including the path)
    document_selection: List[str]
        list of documents that have been selected, by default None

    Returns
    -------
    List[str]
        list of files, without path
    """
    relevant_files = []
    for file in os.listdir(content_folder_path):
        if not is_relevant_file(content_folder_path=content_folder_path,
                                document_selection=document_selection,
                                my_file=file):
            continue
        # every accepted file is logged
        logger.info(f"file {file} is found relevant for ingestion")
        relevant_files.append(file)
    return relevant_files
def exit_program() -> None:
    """
    Logs that the program is exiting and then stops the Python process with exit code 0
    """
    logger.info("Exiting the program...")
    sys.exit(0)
def exit_ui() -> None:
    """
    Exits the User Interface process.

    After a delay of one second the keyboard shortcut ctrl+w is sent to close the active
    browser tab, then the current Python process is terminated.
    From: https://discuss.streamlit.io/t/close-streamlit-app-with-button-click/35132
    """
    # short pause for user experience
    time.sleep(1)
    # ctrl+w closes the streamlit browser tab
    keyboard.press_and_release('ctrl+w')
    logger.info("Closing the application")
    # terminate the streamlit python process, which is the process running this code
    psutil.Process(os.getpid()).terminate()
def getattr_or_default(obj: Any,
attr, default: Any = None) -> Any | None:
""" Get an attribute from an object, returning a default value if the attribute
is not found or its value is None
Parameters
----------
obj : Any
object from which to obtain the attribute
attr : str
name of the attribute
default : Any, optional
default argument, by default None
Returns
-------
Any or None
value of the attribute if not None else the default value, which can be None
"""
value = getattr(obj, attr, default)
return value if value is not None else default
def get_settings_as_dictionary(file_name: str) -> Dict[str, Any]:
    """ Turns the parameters read from the settings file into a dictionary

    Only lines below the first line that starts with "# #########" are read. Comment lines,
    lines that do not contain exactly one "=" and values that start with "{" (the embedding
    map and llm map) are skipped.

    NOTE: the values are evaluated as Python code with exec() and eval(), so the settings
    file must come from a trusted source.

    Parameters
    ----------
    file_name : str
        name of the settings file path

    Returns
    -------
    Dict[str, Any]
        dictionary with parameter names as keys, parameter values as values
    """
    variables_dict = {}
    # Explicit namespace that receives the assignments. Relying on the implicit function
    # locals fails on Python 3.13+, where exec() and eval() each get an independent snapshot
    # of the locals, and on older versions it returns the wrong value when a setting has the
    # same name as one of the local variables of this function.
    namespace = {}
    start_reading = False
    with open(file=file_name, mode='r', encoding="utf-8") as file:
        for line in file:
            # start reading below the line with # #########
            if line.startswith("# #########"):
                start_reading = True
            # ignore everything above the marker and all comment lines
            if not start_reading or line.startswith("#"):
                continue
            # only lines that split into exactly a name and a value are parameters
            parts = line.strip().split('=')
            if len(parts) != 2:
                continue
            variable_name = parts[0].strip()
            variable_value = parts[1].strip()
            # exclude embedding map and llm map
            if variable_value.startswith("{"):
                continue
            # earlier parameters stay in the namespace, so later values may refer to them
            exec(f'{variable_name} = {variable_value}', globals(), namespace)
            variables_dict[variable_name] = eval(variable_name, globals(), namespace)
    return variables_dict
def get_timestamp() -> str:
    """ Gives the current local date and time as text, used for logging

    Returns
    -------
    str
        string timestamp of current time, with a space between the date and the time
    """
    # isoformat with a space separator yields the same text as str() on a datetime
    return dt.datetime.now().isoformat(sep=" ")
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.float64:
    """
    Calculation of cosine similarity between a and b

    The cosine similarity is the dot product of the two arrays divided by the product of
    their norms. It is undefined when one of the arrays has norm zero; in that case 0.0 is
    returned instead of nan.

    Parameters
    ----------
    a : np.ndarray
        first array of numbers
    b : np.ndarray
        second array of numbers

    Returns
    -------
    np.float64
        the calculated cosine similarity between the two arrays of numbers
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # avoid the 0/0 division, which would give nan and a RuntimeWarning
        return np.float64(0.0)
    return np.dot(a, b) / norm_product
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.float64:
    """
    Calculation of euclidean distance between a and b, being the square root of the sum of
    the squared element-wise differences

    Parameters
    ----------
    a : np.ndarray
        first array of numbers
    b : np.ndarray
        second array of numbers

    Returns
    -------
    np.float64
        the calculated euclidean distance between the two arrays of numbers
    """
    difference = a - b
    sum_of_squares = np.sum(np.square(difference))
    return np.sqrt(sum_of_squares)
def detect_language(text: str, number_of_characters: int = 1000) -> str:
    """
    Detects language based on the first X number of characters

    Parameters
    ----------
    text : str
        text of which the language must be detected
    number_of_characters : int, optional
        number of characters from the start of the text that are used, by default 1000

    Returns
    -------
    str
        the language as returned by langdetect, or 'unknown' when the text snippet is empty,
        only contains whitespace or when the detection fails
    """
    # slicing never reads past the end of the text, so no separate length check is needed
    text_snippet = text[:number_of_characters]
    if not text_snippet.strip():
        # Handle the case where the text snippet is empty or only contains whitespace
        return 'unknown'
    try:
        return detect(text_snippet)
    except LangDetectException:
        # every detection failure gives 'unknown', so that the function always returns a
        # string instead of falling through and returning None for some failures
        return 'unknown'
def get_relevant_models(summary: bool, private: bool) -> Tuple[str, str, str, str]:
    """
    Gets the appropriate llm provider and model and embeddings provider and model
    based on whether a summary is wanted and/or whether the documents involved are private

    The private settings take precedence: when private is True they are returned
    regardless of the value of summary.

    Parameters
    ----------
    summary : bool
        indicator for summary as purpose
    private : bool
        indicator for private document

    Returns
    -------
    Tuple[str, str, str, str]
        tuple of llm provider, llm model, embeddings provider and embeddings model,
        in that order
    """
    if private:
        return (settings.PRIVATE_LLM_PROVIDER, settings.PRIVATE_LLM_MODEL,
                settings.PRIVATE_EMBEDDINGS_PROVIDER, settings.PRIVATE_EMBEDDINGS_MODEL)
    if summary:
        return (settings.SUMMARY_LLM_PROVIDER, settings.SUMMARY_LLM_MODEL,
                settings.SUMMARY_EMBEDDINGS_PROVIDER, settings.SUMMARY_EMBEDDINGS_MODEL)
    # neither private nor summary: the general settings apply
    return (settings.LLM_PROVIDER, settings.LLM_MODEL,
            settings.EMBEDDINGS_PROVIDER, settings.EMBEDDINGS_MODEL)
def answer_idontknow(language: str) -> str:
    """
    Gives the answer "I don't know" in the requested language

    Dutch ('nl'), German ('de') and French ('fr') have their own translation, every other
    language gets the English answer.

    Parameters
    ----------
    language : str
        language of the answer

    Returns
    -------
    str
        the answer "I don't know" in the correct language
    """
    translations = {
        'nl': "Ik weet het niet omdat er geen relevante context is die het antwoord bevat",
        'de': "Ich weiß es nicht, weil es keinen relevanten Kontext gibt, der die Antwort enthält",
        'fr': "Je ne sais pas car il n'y a pas de contexte pertinent contenant la réponse",
    }
    english = "I don't know because there is no relevant context containing the answer"
    return translations.get(language, english)
def check_size(content_folder, document_selection):
    """
    Calculates the combined size on disk of the relevant files in the content folder

    Parameters
    ----------
    content_folder : str
        name of the content folder (including the path)
    document_selection : List[str]
        list of documents that have been selected

    Returns
    -------
    float
        size of files in MB
    """
    total_bytes = sum(
        os.path.getsize(os.path.join(content_folder, file_name))
        for file_name in get_relevant_files_in_folder(content_folder, document_selection)
    )
    # convert to MB
    return total_bytes / 1024 / 1024