create_embeddings_and_upsert_data.py
(executable file · 158 lines, 126 loc · 6.03 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
################################################################################
# commbase-genai-slm-ollama-phi3-mini-memory-remote-rag-pinecone #
# #
# A sophisticated AI assistant's Small Language Model (Phi3), enhanced by #
# Retrieval-Augmented Generation (RAG) for improved response accuracy, and #
# supported by a Pinecone semantic vector database. #
# #
# Change History #
# 06/25/2024 Esteban Herrera Original code. #
# Add new history entries as needed. #
# #
# #
################################################################################
################################################################################
################################################################################
# #
# Copyright (c) 2022-present Esteban Herrera C. #
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #
# create_embeddings_and_upsert_data.py
# Creates and configures a new Pinecone index named 'new-pinecone-index'
# Imports
import json
import os

import pandas as pd
import sentence_transformers
# import time
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm

import functions
# --- Load and preview the chat dataset --------------------------------------
# The dataset is produced by an earlier pipeline step under ./build.
df = pd.read_json('./build/dataset.json')

# Sanity check: show up to the first 10 rows.
print(df.head(10))

# Report how many rows were loaded.
total_rows = len(df)
print(f"Total rows in the DataFrame: {total_rows}")

# Inspect the raw chat log as-is, without the DataFrame conversion.
with open('./build/log_chats.json', 'r') as file:
    data = json.load(file)

# Pretty-print the raw JSON payload.
print(json.dumps(data, indent=4))

# Preview the text of at most the first 5 chat records.
for chat in data['log_chats'][:5]:
    print(chat['text'])
# --- Embedding model and Pinecone index setup -------------------------------
# Sanity-check the embedding model via the project helper; it returns the
# loaded SentenceTransformer model plus a sample query embedding.
model, xq = functions.test_embedding_model()
print("Model: ", model)
# The embedding dimensionality must match the Pinecone index configuration.
print("Single query dimensions:", xq.shape)

# Data is upserted in batches. Note: on Google Colab with GPU the expected
# runtime is ~7 minutes; on CPU it is significantly longer.
#
# FIX: read the API key from the environment instead of hard-coding an empty
# string in source control (falls back to "" so behavior without the env var
# is unchanged).
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
index_name = 'commbase-log-chats'
# Connect to the existing index.
index = pc.Index(index_name)

batch_size = 10       # production value: 128
vector_limit = 50     # production value: 100000

# FIX: the original sliced with a hard-coded 50, silently ignoring
# vector_limit; keep the two in sync by slicing with the variable.
entries = data['log_chats'][:vector_limit]

# Optional: echo the selected entries.
for entry in entries:
    print(entry)
print("Number of entries:", len(entries))
# --- Batched embedding + upsert loop ----------------------------------------
for i in tqdm(range(0, len(entries), batch_size)):
    # End index of the current batch, clamped to the number of entries.
    i_end = min(i + batch_size, len(entries))
    batch = entries[i:i_end]
    # Vector IDs are the global positions of the records in `entries`.
    ids = [str(x) for x in range(i, i_end)]
    # Attach every chat-record field as metadata so later queries can
    # filter on / inspect them.
    metadatas = [
        {
            'id': chat['id'],
            'timestamp': chat['timestamp'],
            'origin': chat['origin'],
            'severity': chat['severity'],
            'speaker': chat['speaker'],
            'text': chat['text'],
        }
        for chat in batch
    ]
    # FIX: embed the chat *text* only — the original passed the whole dict,
    # which SentenceTransformer.encode() cannot embed.
    xc = model.encode([chat['text'] for chat in batch])
    # Build (id, vector, metadata) tuples; each numpy vector is converted to
    # a plain list so the Pinecone client can serialize it. Materializing the
    # list also makes the debug print show real data instead of a zip object.
    records = [
        (vec_id, emb.tolist(), meta)
        for vec_id, emb, meta in zip(ids, xc, metadatas)
    ]
    print(records)
    # Upsert the batch into the index.
    index.upsert(vectors=records)

# Show index statistics so the upsert can be verified.
print("")
print("Index statistics:")
print(index.describe_index_stats())