Add new python notebook

qn895 · qn895 · commit 680b27c3aa03 · 2025-09-28T19:30:52.000-05:00
diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data successfully downloaded and saved to multilingual_coco_sample.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "import os\n",
+    "\n",
+    "# Download a sample of the multilingual COCO dataset.\n",
+    "# Only the first 100 rows of the `restval` split are retrieved for this example;\n",
+    "# alternatively, load the dataset with the Hugging Face `datasets` library.\n",
+    "url = \"https://datasets-server.huggingface.co/rows?dataset=romrawinjp%2Fmultilingual-coco&config=default&split=restval&offset=0&length=100\"\n",
+    "# File the downloaded sample is saved to\n",
+    "output_file = \"multilingual_coco_sample.json\"\n",
+    "\n",
+    "# Make the GET request; the timeout keeps the cell from hanging indefinitely\n",
+    "# if the server never answers\n",
+    "response = requests.get(url, timeout=30)\n",
+    "\n",
+    "# Check if the request was successful\n",
+    "if response.status_code == 200:\n",
+    "    # Parse the JSON response\n",
+    "    data = response.json()\n",
+    "\n",
+    "    # Write UTF-8 with ensure_ascii=False so non-English captions stay readable\n",
+    "    with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
+    "        json.dump(data, f, indent=4, ensure_ascii=False)\n",
+    "\n",
+    "    print(f\"Data successfully downloaded and saved to {output_file}\")\n",
+    "else:\n",
+    "    # Show the status code and body so the failure can be diagnosed\n",
+    "    print(f\"Failed to download data: {response.status_code}\")\n",
+    "    print(response.text)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from getpass import getpass\n",
+    "\n",
+    "# Prompt for the Elastic Cloud ID and API key.\n",
+    "# The API key is read with getpass so it is not echoed back in the notebook.\n",
+    "print(\"Enter your Elasticsearch credentials:\")\n",
+    "cloud_id = input(\"Enter your cloud_id: \")\n",
+    "api_key = getpass(prompt=\"Enter your api_key: \")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully connected to Elasticsearch\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:311: SecurityWarning: Connecting to 'https://localhost:9200' using TLS with verify_certs=False is insecure\n",
+      "  _transport = transport_class(\n",
+      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from elasticsearch import Elasticsearch\n",
+    "\n",
+    "# Connect with the cloud_id and api_key entered in the previous cell, so no\n",
+    "# password is stored in the notebook and TLS certificate checks stay enabled.\n",
+    "# For a local cluster, pass hosts=[\"https://localhost:9200\"] and\n",
+    "# basic_auth=(\"elastic\", getpass(\"Password: \")) instead of cloud_id/api_key.\n",
+    "try:\n",
+    "    es = Elasticsearch(cloud_id=cloud_id, api_key=api_key)\n",
+    "\n",
+    "    # Test the connection: a falsy ping() result is turned into an exception\n",
+    "    # so the cell stops here instead of failing later during indexing\n",
+    "    if not es.ping():\n",
+    "        raise ConnectionError(\"Failed to connect to Elasticsearch\")\n",
+    "\n",
+    "    print(\"Successfully connected to Elasticsearch\")\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Print a hint for the reader, then re-raise so the error is not hidden\n",
+    "    print(f\"Error connecting to Elasticsearch: {e}\")\n",
+    "    print(\"Please check your credentials\")\n",
+    "    raise\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully bulk indexed 4840 documents\n",
+      "Indexing complete!\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
+      "  warnings.warn(\n",
+      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Index the downloaded captions into Elasticsearch, one document per caption\n",
+    "index_name = \"coco\"\n",
+    "mapping = {\n",
+    "    \"mappings\": {\n",
+    "        \"properties\": {\n",
+    "            \"language\": {\"type\": \"keyword\"},\n",
+    "            \"description\": {\"type\": \"text\"},\n",
+    "            \"en\": {\"type\": \"text\"},\n",
+    "            \"image_url\": {\"type\": \"keyword\"},\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "# Create the index if it doesn't exist\n",
+    "if not es.indices.exists(index=index_name):\n",
+    "    es.indices.create(index=index_name, body=mapping)\n",
+    "\n",
+    "# The sample file contains non-ASCII captions, so read it explicitly as UTF-8\n",
+    "# rather than with the platform default encoding\n",
+    "with open('./multilingual_coco_sample.json', 'r', encoding='utf-8') as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "rows = data[\"rows\"]\n",
+    "# List of languages to process\n",
+    "languages = [\"en\", \"es\", \"de\", \"it\", \"vi\", \"th\"]\n",
+    "\n",
+    "# bulk_data holds two entries per document (action line + document body),\n",
+    "# so the number of documents is tracked separately for reporting\n",
+    "bulk_data = []\n",
+    "doc_count = 0\n",
+    "for obj in rows:\n",
+    "    row = obj[\"row\"]\n",
+    "    # Tolerate rows without an image entry; image_url is then None\n",
+    "    image = row.get(\"image\") or {}\n",
+    "    image_url = image.get(\"src\")\n",
+    "\n",
+    "    # The first English caption is stored on every document of this row;\n",
+    "    # it is None when the row has no English captions\n",
+    "    en_captions = row.get(\"en\") or []\n",
+    "    first_eng_caption = en_captions[0] if en_captions else None\n",
+    "\n",
+    "    # Process each language\n",
+    "    for lang in languages:\n",
+    "        # Languages missing or empty in this row contribute no documents\n",
+    "        for description in row.get(lang) or []:\n",
+    "            # Skip empty captions\n",
+    "            if not description:\n",
+    "                continue\n",
+    "            # Add index operation\n",
+    "            bulk_data.append({\"index\": {\"_index\": index_name}})\n",
+    "            # Add document\n",
+    "            bulk_data.append(\n",
+    "                {\n",
+    "                    \"language\": lang,\n",
+    "                    \"description\": description,\n",
+    "                    \"en\": first_eng_caption,\n",
+    "                    \"image_url\": image_url,\n",
+    "                }\n",
+    "            )\n",
+    "            doc_count += 1\n",
+    "\n",
+    "# Perform bulk indexing\n",
+    "if bulk_data:\n",
+    "    try:\n",
+    "        response = es.bulk(operations=bulk_data)\n",
+    "        if response[\"errors\"]:\n",
+    "            # Each response item is keyed by its action type (always \"index\" here)\n",
+    "            failed = sum(1 for item in response[\"items\"] if \"error\" in item[\"index\"])\n",
+    "            print(\"Some documents failed to index\")\n",
+    "            print(f\"{failed} of {doc_count} documents were rejected\")\n",
+    "        else:\n",
+    "            print(f\"Successfully bulk indexed {doc_count} documents\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error during bulk indexing: {str(e)}\")\n",
+    "\n",
+    "print(\"Indexing complete!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}