deep_memory_implementation.py
# pylint: disable=locally-disabled, multiple-statements, fixme, line-too-long, missing-function-docstring
import argparse
import getpass
import os

from deeplake import VectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.deeplake import DeepLake

from dataset_generator_langchain import get_chunk_question
from global_variables import (
    TYPE_BIOMEDICAL,
    TYPE_LEGAL,
    TYPE_FINANCE,
    YAML_FILE,
    HUB_NAME,
)
TYPE = TYPE_FINANCE
parser = argparse.ArgumentParser()
parser.add_argument("--credentials", action="store_true")
args = parser.parse_args()
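# Pass --credentials to be prompted for the secrets interactively, e.g.:
#   python deep_memory_implementation.py --credentials
# Otherwise both values are expected to already be set in the environment.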
if args.credentials:
    os.environ["ACTIVELOOP_TOKEN"] = getpass.getpass(
        "Copy and paste your ActiveLoop token: "
    )
    os.environ["OPENAI_API_KEY"] = getpass.getpass(
        "Copy and paste your OpenAI API key: "
    )
else:
    # The credentials must already be present in the environment when
    # --credentials is not passed; fail fast if either one is missing.
    for var in ("ACTIVELOOP_TOKEN", "OPENAI_API_KEY"):
        if not os.getenv(var):
            raise EnvironmentError(f"{var} is not set")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

def load_vector_store(user_hub, name_db):
    vector_store_db = DeepLake(
        f"hub://{user_hub}/{name_db}",
        embedding_function=embeddings.embed_documents,
        runtime={"tensor_db": True},
        read_only=True,
    )
    return vector_store_db.vectorstore
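
# A minimal usage sketch (the hub and dataset names below are invented
# placeholders, not values from this project):
#   store = load_vector_store("my_org", "finance_dataset")
#   print(f"Loaded {len(store.dataset)} chunks")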

def training_job(vector_store_db, chunk_question_quantity: int):
    questions = []
    relevances = []
    for idx, el in enumerate(vector_store_db.dataset):
        if idx >= chunk_question_quantity:
            break
        print(f"Generating question: {idx}")
        chunk_id = str(el.id.data()["value"])
        text = str(el.text.data()["value"])
        print(f"Processing chunk: {idx}")
        single_question = get_chunk_question(text)
        questions.append(single_question)
        relevances.append([(chunk_id, 1)])
    job_id = vector_store_db.deep_memory.train(
        queries=questions,
        relevance=relevances,
        embedding_function=embeddings.embed_documents,
    )
    vector_store_db.deep_memory.status(job_id)
    return vector_store_db
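
# Deep Memory training pairs each generated question with the id of the
# chunk it came from; a relevance weight of 1 marks that chunk as the
# correct retrieval target. Illustrative shape (values are invented):
#   queries   = ["What drives quarterly revenue growth?"]
#   relevance = [[("<chunk-id>", 1)]]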

def get_answer(vector_store_db, user_question, deep_memory):
    # deep memory inside the vector store ==> deep_memory=True
    answer = vector_store_db.search(
        embedding_data=user_question,
        embedding_function=embeddings.embed_query,
        deep_memory=deep_memory,
        return_view=False,
    )
    print(answer)
    return answer
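
# Sketch of an A/B comparison between Deep Memory retrieval and plain
# embedding search (the question text is an invented placeholder):
#   q = "Which filings discuss liquidity risk?"
#   with_dm = get_answer(vector_store_db, q, deep_memory=True)
#   baseline = get_answer(vector_store_db, q, deep_memory=False)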

def check_status(job_id: str, vector_store_db_path: str):
    db = VectorStore(vector_store_db_path, read_only=True)
    status = db.deep_memory.status(job_id)
    print(status)

def evaluate_deep_memory(vector_store_db, train_question_quantity, number_of_questions):
    test_questions = []
    test_relevance = []
    # Evaluate on chunks that were not used to generate training questions.
    dataset_unseen = vector_store_db.dataset[train_question_quantity:]
    for idx, el in enumerate(dataset_unseen):
        if idx >= number_of_questions:
            break
        print(f"Generating question: {idx}")
        chunk_id = str(el.id.data()["value"])
        text = str(el.text.data()["value"])
        single_question = get_chunk_question(text)
        test_questions.append(single_question)
        test_relevance.append([(chunk_id, 1)])
    evaluation = vector_store_db.deep_memory.evaluate(
        queries=test_questions,
        relevance=test_relevance,
        embedding_function=embeddings.embed_documents,
        top_k=[1, 3, 5, 10, 50, 100],
    )
    with open(
        f"question_evaluation_{YAML_FILE['db'][TYPE]['name']}.txt", "w", encoding="utf-8"
    ) as file:
        file.write(f"Test Questions: {test_questions}\n")
        file.write(f"Evaluation Result: {evaluation}\n")
    return evaluation
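
# `evaluate` reports recall@k at each requested cutoff for both plain
# embedding search and the Deep Memory model, so the two can be compared
# side by side; the exact structure of the returned mapping may vary by
# deeplake version.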

if __name__ == "__main__":
    DATASET_NAME = YAML_FILE["db"][TYPE]["name"]
    train_question_quantity = YAML_FILE["db"][TYPE]["query_numbers"]
    vector_store_db = load_vector_store(HUB_NAME, DATASET_NAME)
    # UPLOAD THE DATASET WITH DEEPCOPY
    # upload_with_deepcopy("embedding_custom4_11_13_2023")
    # TRAIN PHASE
    # training_job(vector_store_db, train_question_quantity)
    # print(vector_store_db.deep_memory.list_jobs())
    # user_question = "Female carriers of the Apolipoprotein E4 (APOE4) allele have increased risk for dementia."
    # TEST PHASE
    # search_response_deep_memory = get_answer(vector_store_db, user_question, deep_memory=True)
    # search_response_standard = get_answer(vector_store_db, user_question, deep_memory=False)
    evaluation = evaluate_deep_memory(
        vector_store_db, train_question_quantity, number_of_questions=100
    )
    # job_id = YAML_FILE["db"][TYPE]["job_id"]
    # vector_store_db_path = f"""hub://manufe/{YAML_FILE["db"][TYPE]["name"]}"""
    # check_status(
    #     job_id=job_id,
    #     vector_store_db_path=vector_store_db_path,
    # )
    print(evaluation)