|
37 | 37 | "- Signup for an OpenAI Developer Account and create an API Key. See [OpenAI's developer platform](https://platform.openai.com/overview).\n",
|
38 | 38 | "- Install Python\n",
|
39 | 39 | "- Install and configure a python virtual environment. We recommend [Pyenv](https://github.com/pyenv/pyenv)\n",
|
40 |
| - "- Install the requirements for this notebook using the following command:\n", |
41 |
| - "\n", |
42 |
| - "```\n", |
43 |
| - "pip install -r requirements.txt\n", |
44 |
| - "```" |
| 40 | + "- Install the requirements for this notebook using the following command:" |
| 41 | + ] |
| 42 | + }, |
| 43 | + { |
| 44 | + "cell_type": "code", |
| 45 | + "execution_count": 188, |
| 46 | + "metadata": {}, |
| 47 | + "outputs": [], |
| 48 | + "source": [ |
| 49 | + "%pip install -r requirements.txt" |
45 | 50 | ]
|
46 | 51 | },
|
47 | 52 | {
|
48 | 53 | "cell_type": "code",
|
49 |
| - "execution_count": null, |
| 54 | + "execution_count": 189, |
50 | 55 | "metadata": {},
|
51 | 56 | "outputs": [],
|
52 | 57 | "source": [
|
|
66 | 71 | },
|
67 | 72 | {
|
68 | 73 | "cell_type": "code",
|
69 |
| - "execution_count": null, |
| 74 | + "execution_count": 190, |
70 | 75 | "metadata": {},
|
71 | 76 | "outputs": [],
|
72 | 77 | "source": [
|
|
89 | 94 | },
|
90 | 95 | {
|
91 | 96 | "cell_type": "code",
|
92 |
| - "execution_count": null, |
| 97 | + "execution_count": 191, |
93 | 98 | "metadata": {},
|
94 | 99 | "outputs": [],
|
95 | 100 | "source": [
|
|
112 | 117 | },
|
113 | 118 | {
|
114 | 119 | "cell_type": "code",
|
115 |
| - "execution_count": null, |
| 120 | + "execution_count": 192, |
116 | 121 | "metadata": {},
|
117 | 122 | "outputs": [],
|
118 | 123 | "source": [
|
|
137 | 142 | "# Assumes we're using the text-embedding-3-small model\n",
|
138 | 143 | "# See https://openai.com/pricing\n",
|
139 | 144 | "def get_embedding_cost(num_tokens):\n",
|
140 |
| - " return num_tokens/1000*0.0001\n", |
| 145 | + " return num_tokens/1000*0.00002\n", |
141 | 146 | "\n",
|
142 | 147 | "# Helper function: calculate total cost of embedding all content in the dataframe\n",
|
143 | 148 | "def get_total_embeddings_cost():\n",
|
|
147 | 152 | " token_len = num_tokens_from_string(text)\n",
|
148 | 153 | " total_tokens = total_tokens + token_len\n",
|
149 | 154 | " total_cost = get_embedding_cost(total_tokens)\n",
|
150 |
| - " return total_cost\n", |
151 |
| - "\n", |
152 |
| - "# Helper function: get embeddings for a text\n", |
153 |
| - "def get_embeddings(text):\n", |
154 |
| - " response = openai.Embedding.create(\n", |
155 |
| - " model=\"text-embedding-ada-002\",\n", |
156 |
| - " input = text.replace(\"\\n\",\" \")\n", |
157 |
| - " )\n", |
158 |
| - " embedding = response['data'][0]['embedding']\n", |
159 |
| - " return embedding" |
| 155 | + " return total_cost" |
160 | 156 | ]
|
161 | 157 | },
|
162 | 158 | {
|
163 | 159 | "cell_type": "code",
|
164 |
| - "execution_count": null, |
| 160 | + "execution_count": 193, |
165 | 161 | "metadata": {},
|
166 | 162 | "outputs": [],
|
167 | 163 | "source": [
|
|
189 | 185 | },
|
190 | 186 | {
|
191 | 187 | "cell_type": "code",
|
192 |
| - "execution_count": null, |
| 188 | + "execution_count": 194, |
193 | 189 | "metadata": {},
|
194 | 190 | "outputs": [],
|
195 | 191 | "source": [
|
196 | 192 | "###############################################################################\n",
|
197 | 193 | "# Create new list with small content chunks to not hit max token limits\n",
|
198 | 194 | "# Note: the maximum number of tokens for a single request is 8191\n",
|
199 |
| - "# https://openai.com/docs/api-reference/requests\n", |
| 195 | + "# https://platform.openai.com/docs/guides/embeddings/embedding-models\n", |
200 | 196 | "###############################################################################\n",
|
201 | 197 | "# list for chunked content and embeddings\n",
|
202 | 198 | "new_list = []\n",
|
|
241 | 237 | },
|
242 | 238 | {
|
243 | 239 | "cell_type": "code",
|
244 |
| - "execution_count": null, |
| 240 | + "execution_count": 195, |
| 241 | + "metadata": {}, |
| 242 | + "outputs": [], |
| 243 | + "source": [ |
| 244 | + "openai_client = openai.OpenAI()\n", |
| 245 | + "\n", |
| 246 | + "# Helper function: get embeddings for a text\n", |
| 247 | + "def get_embeddings(text):\n", |
| 248 | + " response = openai_client.embeddings.create(\n", |
| 249 | + " model=\"text-embedding-3-small\",\n", |
| 250 | + " input = text.replace(\"\\n\",\" \")\n", |
| 251 | + " )\n", |
| 252 | + " return response.data[0].embedding" |
| 253 | + ] |
| 254 | + }, |
| 255 | + { |
| 256 | + "cell_type": "code", |
| 257 | + "execution_count": 196, |
245 | 258 | "metadata": {},
|
246 | 259 | "outputs": [],
|
247 | 260 | "source": [
|
|
258 | 271 | },
|
259 | 272 | {
|
260 | 273 | "cell_type": "code",
|
261 |
| - "execution_count": null, |
| 274 | + "execution_count": 197, |
262 | 275 | "metadata": {},
|
263 | 276 | "outputs": [],
|
264 | 277 | "source": [
|
265 | 278 | "# Save the dataframe with embeddings as a CSV file\n",
|
266 | 279 | "df_new.to_csv('blog_data_and_embeddings.csv', index=False)\n",
|
267 | 280 | "# It may also be useful to save as a json file, but we won't use this in the tutorial\n",
|
268 |
| - "#df_new.to_json('blog_data_and_embeddings.json')" |
| 281 | + "#df_new.to_json('blog_data_and_embeddings.json') " |
269 | 282 | ]
|
270 | 283 | },
|
271 | 284 | {
|
|
291 | 304 | },
|
292 | 305 | {
|
293 | 306 | "cell_type": "code",
|
294 |
| - "execution_count": null, |
| 307 | + "execution_count": 198, |
295 | 308 | "metadata": {},
|
296 | 309 | "outputs": [],
|
297 | 310 | "source": [
|
|
304 | 317 | },
|
305 | 318 | {
|
306 | 319 | "cell_type": "code",
|
307 |
| - "execution_count": null, |
| 320 | + "execution_count": 199, |
308 | 321 | "metadata": {},
|
309 | 322 | "outputs": [],
|
310 | 323 | "source": [
|
|
313 | 326 | "cur = conn.cursor()\n",
|
314 | 327 | "\n",
|
315 | 328 | "#install pgvector \n",
|
316 |
| - "cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector\");\n", |
| 329 | + "cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector;\")\n", |
| 330 | + "conn.commit()\n", |
| 331 | + "\n", |
| 332 | + "#install pgvectorscale \n", |
| 333 | + "cur.execute(\"CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;\")\n", |
317 | 334 | "conn.commit()\n",
|
318 | 335 | "\n",
|
319 | 336 | "# Register the vector type with psycopg2\n",
|
|
346 | 363 | },
|
347 | 364 | {
|
348 | 365 | "cell_type": "code",
|
349 |
| - "execution_count": null, |
| 366 | + "execution_count": 200, |
350 | 367 | "metadata": {},
|
351 | 368 | "outputs": [],
|
352 | 369 | "source": [
|
|
381 | 398 | },
|
382 | 399 | {
|
383 | 400 | "cell_type": "code",
|
384 |
| - "execution_count": null, |
| 401 | + "execution_count": 201, |
385 | 402 | "metadata": {},
|
386 | 403 | "outputs": [],
|
387 | 404 | "source": [
|
|
391 | 408 | },
|
392 | 409 | {
|
393 | 410 | "cell_type": "code",
|
394 |
| - "execution_count": null, |
| 411 | + "execution_count": 202, |
395 | 412 | "metadata": {},
|
396 | 413 | "outputs": [],
|
397 | 414 | "source": [
|
|
409 | 426 | },
|
410 | 427 | {
|
411 | 428 | "cell_type": "code",
|
412 |
| - "execution_count": null, |
| 429 | + "execution_count": 203, |
413 | 430 | "metadata": {},
|
414 | 431 | "outputs": [],
|
415 | 432 | "source": [
|
|
433 | 450 | },
|
434 | 451 | {
|
435 | 452 | "cell_type": "code",
|
436 |
| - "execution_count": null, |
| 453 | + "execution_count": 204, |
437 | 454 | "metadata": {},
|
438 | 455 | "outputs": [],
|
439 | 456 | "source": [
|
|
445 | 462 | },
|
446 | 463 | {
|
447 | 464 | "cell_type": "code",
|
448 |
| - "execution_count": null, |
| 465 | + "execution_count": 205, |
449 | 466 | "metadata": {},
|
450 | 467 | "outputs": [],
|
451 | 468 | "source": [
|
|
465 | 482 | },
|
466 | 483 | {
|
467 | 484 | "cell_type": "code",
|
468 |
| - "execution_count": null, |
| 485 | + "execution_count": 206, |
469 | 486 | "metadata": {},
|
470 | 487 | "outputs": [],
|
471 | 488 | "source": [
|
472 | 489 | "# Create an index on the data for faster retrieval\n",
|
473 | 490 | "# this isn't really needed for 129 vectors, but it shows the usage for larger datasets\n",
|
474 | 491 | "# Note: always create this type of index after you have data already inserted into the DB\n",
|
475 | 492 | "\n",
|
476 |
| - "#calculate the index parameters according to best practices\n", |
477 |
| - "num_lists = num_records / 1000\n", |
478 |
| - "if num_lists < 10:\n", |
479 |
| - " num_lists = 10\n", |
480 |
| - "if num_records > 1000000:\n", |
481 |
| - " num_lists = math.sqrt(num_records)\n", |
482 |
| - "\n", |
483 |
| - "#use the cosine distance measure, which is what we'll later use for querying\n", |
484 |
| - "cur.execute(f'CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {num_lists});')\n", |
485 |
| - "conn.commit() " |
| 493 | + "# for different tuning suggestions check this: https://github.com/timescale/pgvectorscale?tab=readme-ov-file#tuning\n", |
| 494 | + "cur.execute('CREATE INDEX embedding_idx ON embeddings USING diskann (embedding);')\n", |
| 495 | + "conn.commit()" |
486 | 496 | ]
|
487 | 497 | },
|
488 | 498 | {
|
|
499 | 509 | },
|
500 | 510 | {
|
501 | 511 | "cell_type": "code",
|
502 |
| - "execution_count": null, |
| 512 | + "execution_count": 207, |
503 | 513 | "metadata": {},
|
504 | 514 | "outputs": [],
|
505 | 515 | "source": [
|
506 | 516 | "# Helper function: get text completion from OpenAI API\n",
|
507 | 517 | "# Note: max_tokens caps the length of the generated completion\n",
|
508 | 518 | "# Note we're using the gpt-4o model\n",
|
509 |
| - "def get_completion_from_messages(messages, model=\"gpt-3.5-turbo-0613\", temperature=0, max_tokens=1000):\n", |
510 |
| - " response = openai.ChatCompletion.create(\n", |
| 519 | + "def get_completion_from_messages(messages, model=\"gpt-4o\", temperature=0, max_tokens=1000):\n", |
| 520 | + " response = openai_client.chat.completions.create(\n", |
511 | 521 | " model=model,\n",
|
512 | 522 | " messages=messages,\n",
|
513 | 523 | " temperature=temperature, \n",
|
514 | 524 | " max_tokens=max_tokens, \n",
|
515 | 525 | " )\n",
|
516 |
| - " return response.choices[0].message[\"content\"]" |
| 526 | + " return response.choices[0].message.content" |
517 | 527 | ]
|
518 | 528 | },
|
519 | 529 | {
|
520 | 530 | "cell_type": "code",
|
521 |
| - "execution_count": null, |
| 531 | + "execution_count": 208, |
522 | 532 | "metadata": {},
|
523 | 533 | "outputs": [],
|
524 | 534 | "source": [
|
|
547 | 557 | },
|
548 | 558 | {
|
549 | 559 | "cell_type": "code",
|
550 |
| - "execution_count": null, |
| 560 | + "execution_count": 209, |
551 | 561 | "metadata": {},
|
552 | 562 | "outputs": [],
|
553 | 563 | "source": [
|
|
557 | 567 | },
|
558 | 568 | {
|
559 | 569 | "cell_type": "code",
|
560 |
| - "execution_count": null, |
| 570 | + "execution_count": 210, |
561 | 571 | "metadata": {},
|
562 | 572 | "outputs": [],
|
563 | 573 | "source": [
|
|
590 | 600 | },
|
591 | 601 | {
|
592 | 602 | "cell_type": "code",
|
593 |
| - "execution_count": null, |
| 603 | + "execution_count": 211, |
594 | 604 | "metadata": {},
|
595 | 605 | "outputs": [],
|
596 | 606 | "source": [
|
|
601 | 611 | },
|
602 | 612 | {
|
603 | 613 | "cell_type": "code",
|
604 |
| - "execution_count": null, |
| 614 | + "execution_count": 212, |
605 | 615 | "metadata": {},
|
606 | 616 | "outputs": [],
|
607 | 617 | "source": [
|
|
629 | 639 | "name": "python",
|
630 | 640 | "nbconvert_exporter": "python",
|
631 | 641 | "pygments_lexer": "ipython3",
|
632 |
| - "version": "3.8.16" |
| 642 | + "version": "3.9.6" |
633 | 643 | }
|
634 | 644 | },
|
635 | 645 | "nbformat": 4,
|
|
0 commit comments