Skip to content

Commit 680b27c

Browse files
committed
Add new python notebook
1 parent 8948851 commit 680b27c

File tree

1 file changed

+225
-0
lines changed

1 file changed

+225
-0
lines changed
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
{
2+
"cells": [
3+
import requests
import json

### Download multilingual coco dataset
### Here we are retrieving first 100 rows for this example
### Alternatively, you can use dataset library from Hugging Face
url = (
    "https://datasets-server.huggingface.co/rows"
    "?dataset=romrawinjp%2Fmultilingual-coco"
    "&config=default&split=restval&offset=0&length=100"
)

# Make the GET request.
# BUGFIX: the original call had no timeout, so an unresponsive server
# would hang the notebook indefinitely.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Define the output file path
    output_file = "multilingual_coco_sample.json"

    # Save the JSON data to a file (UTF-8 so non-Latin captions survive)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Data successfully downloaded and saved to {output_file}")
else:
    print(f"Failed to download data: {response.status_code}")
    print(response.text)
46+
from getpass import getpass

# Collect Elastic Cloud connection credentials interactively so they are
# never hard-coded into the notebook.
# NOTE(review): the original comment said "localhost Elasticsearch", but
# `cloud_id`/`api_key` are Elastic Cloud credentials, not localhost settings.
print("Enter your Elasticsearch credentials:")
cloud_id = input("Enter your cloud_id: ")
# getpass hides the key as it is typed
api_key = getpass("Enter your api_key: ")
60+
from elasticsearch import Elasticsearch
from getpass import getpass

try:
    # SECURITY FIX: the password was previously hard-coded in the committed
    # notebook; prompt for it instead so credentials never land in version
    # control.
    es = Elasticsearch(
        hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}],
        basic_auth=("elastic", getpass("Elasticsearch password: ")),
        # SECURITY: verify_certs=False is insecure (the client itself emits a
        # SecurityWarning) — set to True once valid SSL certificates exist.
        verify_certs=False,
        # Alternatively, connect to Elastic Cloud with the cloud_id/api_key
        # gathered in the previous cell:
        # cloud_id=cloud_id, api_key=api_key,
    )

    # Test the connection
    if not es.ping():
        raise Exception("Failed to connect to Elasticsearch")

    print("Successfully connected to Elasticsearch")

except Exception as e:
    print(f"Error connecting to Elasticsearch: {e}")
    print("Please check your credentials")
    raise
107+
import json

# Define the index mapping
index_name = "coco"
mapping = {
    "mappings": {
        "properties": {
            "language": {"type": "keyword"},
            "description": {"type": "text"},
            "en": {"type": "text"},
            "image_url": {"type": "keyword"},
        }
    }
}

# Create the index if it doesn't exist
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

# Load the JSON data downloaded in the first cell
with open("./multilingual_coco_sample.json", "r", encoding="utf-8") as f:
    data = json.load(f)

rows = data["rows"]
# List of languages to process
languages = ["en", "es", "de", "it", "vi", "th"]

bulk_data = []
for obj in rows:
    row = obj["row"]

    # BUGFIX: the original did row.get("image")["src"], which raises
    # AttributeError when "image" is absent. Skip such rows instead.
    image = row.get("image")
    if not image or "src" not in image:
        continue
    image_url = image["src"]

    # BUGFIX: row["en"][0] raised KeyError/IndexError when the English
    # captions were missing or empty; skip those rows too.
    english_captions = row.get("en") or []
    if not english_captions:
        continue
    first_eng_caption = english_captions[0]

    # Process each language
    for lang in languages:
        # Skip if language not present in this row
        if lang not in row:
            continue

        # Emit one action line plus one document per non-empty caption
        for description in row[lang]:
            if description == "":
                continue
            bulk_data.append({"index": {"_index": index_name}})
            bulk_data.append({
                "language": lang,
                "description": description,
                "en": first_eng_caption,
                "image_url": image_url,
            })

# Perform bulk indexing
if bulk_data:
    try:
        response = es.bulk(operations=bulk_data)
        if response["errors"]:
            print("Some documents failed to index")
        else:
            # BUGFIX: bulk_data holds two entries per document (action line
            # + source), so the original len(bulk_data) reported double the
            # real document count.
            print(f"Successfully bulk indexed {len(bulk_data) // 2} documents")
    except Exception as e:
        print(f"Error during bulk indexing: {str(e)}")

print("Indexing complete!")
203+
],
204+
"metadata": {
205+
"kernelspec": {
206+
"display_name": "Python 3",
207+
"language": "python",
208+
"name": "python3"
209+
},
210+
"language_info": {
211+
"codemirror_mode": {
212+
"name": "ipython",
213+
"version": 3
214+
},
215+
"file_extension": ".py",
216+
"mimetype": "text/x-python",
217+
"name": "python",
218+
"nbconvert_exporter": "python",
219+
"pygments_lexer": "ipython3",
220+
"version": "3.11.6"
221+
}
222+
},
223+
"nbformat": 4,
224+
"nbformat_minor": 2
225+
}

0 commit comments

Comments
 (0)