Skip to content

Commit 184e40b

Browse files
authored
Bugfix in Crawler Migration Notebook extraction rule cell (elastic#410)
1 parent d0605d2 commit 184e40b

File tree

1 file changed

+56
-43
lines changed

1 file changed

+56
-43
lines changed

notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb

Lines changed: 56 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -269,55 +269,68 @@
269269
" config_oid = source[\"configuration_oid\"]\n",
270270
" domain_oid = source[\"domain_oid\"]\n",
271271
"\n",
272-
" all_rules = source[\"rules\"]\n",
273-
" all_url_filters = source[\"url_filters\"]\n",
274-
"\n",
275-
" # extract url filters\n",
276-
" url_filters = []\n",
277-
" if all_url_filters:\n",
278-
" url_filters = [\n",
279-
" {\n",
280-
" \"type\": all_url_filters[0][\"filter\"],\n",
281-
" \"pattern\": all_url_filters[0][\"pattern\"],\n",
282-
" }\n",
283-
" ]\n",
284-
"\n",
285-
" # extract rulesets\n",
286-
" action_translation_map = {\n",
287-
" \"fixed\": \"set\",\n",
288-
" \"extracted\": \"extract\",\n",
289-
" }\n",
272+
" # ensure the config and domain oids actually exist in our in-memory data structure\n",
273+
" if (\n",
274+
" config_oid in inflight_configuration_data\n",
275+
" and domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]\n",
276+
" ):\n",
277+
"\n",
278+
"            # initialize extraction rulesets as an empty array if it doesn't exist yet\n",
279+
" if (\n",
280+
" not \"extraction_rulesets\"\n",
281+
" in inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid]\n",
282+
" ):\n",
283+
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
284+
" \"extraction_rulesets\"\n",
285+
" ] = []\n",
286+
"\n",
287+
" all_rules = source[\"rules\"]\n",
288+
" all_url_filters = source[\"url_filters\"]\n",
289+
"\n",
290+
" # extract url filters\n",
291+
" url_filters = []\n",
292+
" if all_url_filters:\n",
293+
" url_filters = [\n",
294+
" {\n",
295+
" \"type\": all_url_filters[0][\"filter\"],\n",
296+
" \"pattern\": all_url_filters[0][\"pattern\"],\n",
297+
" }\n",
298+
" ]\n",
290299
"\n",
291-
" ruleset = {}\n",
292-
" if all_rules:\n",
293-
" ruleset = [\n",
294-
" {\n",
295-
" \"action\": action_translation_map[\n",
296-
" all_rules[0][\"content_from\"][\"value_type\"]\n",
297-
" ],\n",
298-
" \"field_name\": all_rules[0][\"field_name\"],\n",
299-
" \"selector\": all_rules[0][\"selector\"],\n",
300-
" \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
301-
" \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
302-
" \"source\": all_rules[0][\"source_type\"],\n",
303-
" }\n",
304-
" ]\n",
300+
" # extract rulesets\n",
301+
" action_translation_map = {\n",
302+
" \"fixed\": \"set\",\n",
303+
" \"extracted\": \"extract\",\n",
304+
" }\n",
305305
"\n",
306-
" # populate the in-memory data structure\n",
307-
" temp_extraction_rulesets = [\n",
308-
" {\n",
306+
" ruleset = []\n",
307+
" if all_rules:\n",
308+
" ruleset = [\n",
309+
" {\n",
310+
" \"action\": action_translation_map[\n",
311+
" all_rules[0][\"content_from\"][\"value_type\"]\n",
312+
" ],\n",
313+
" \"field_name\": all_rules[0][\"field_name\"],\n",
314+
" \"selector\": all_rules[0][\"selector\"],\n",
315+
" \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
316+
" \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
317+
" \"source\": all_rules[0][\"source_type\"],\n",
318+
" }\n",
319+
" ]\n",
320+
"\n",
321+
" temp_extraction_rulesets = {\n",
309322
" \"url_filters\": url_filters,\n",
310323
" \"rules\": ruleset,\n",
311324
" }\n",
312-
" ]\n",
313325
"\n",
314-
" print(\n",
315-
" f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
316-
" )\n",
317-
" extr_count += 1\n",
318-
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
319-
" \"extraction_rulesets\"\n",
320-
" ] = temp_extraction_rulesets"
326+
" print(\n",
327+
" f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
328+
" )\n",
329+
" extr_count += 1\n",
330+
"\n",
331+
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
332+
" \"extraction_rulesets\"\n",
333+
" ].append(temp_extraction_rulesets)"
321334
]
322335
},
323336
{

0 commit comments

Comments
 (0)