Skip to content

Commit 13b353a

Browse files
ZeratuuLL and Lifeeng Wei authored Apr 1, 2022
Lifengwei/update nlp notebooks (#1090)
* remove some packages * add back a package * update notebooks, mainly change multilabel data file * data file * fix typos, codes in notebooks * update experiment name * fix typo, remove comments Co-authored-by: Lifeeng Wei <lifengwei@microsoft.com>
1 parent c41594c commit 13b353a

11 files changed

+51882
-51885
lines changed
 

‎python-sdk/tutorials/automl-with-azureml/automl-nlp-multiclass/automl-nlp-text-classification-multiclass.ipynb

+28-37
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
"cell_type": "markdown",
1414
"metadata": {},
1515
"source": [
16-
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.png)"
16+
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/experimental/automl-nlp-multiclass/automl-nlp-text-classification-multiclass.png)"
1717
]
1818
},
1919
{
@@ -68,6 +68,7 @@
6868
"source": [
6969
"import logging\n",
7070
"import os\n",
71+
"import tempfile\n",
7172
"\n",
7273
"import pandas as pd\n",
7374
"\n",
@@ -77,9 +78,13 @@
7778
"from azureml.core.dataset import Dataset\n",
7879
"from azureml.core.compute import AmlCompute\n",
7980
"from azureml.core.compute import ComputeTarget\n",
81+
"from azureml.core.compute_target import ComputeTargetException\n",
82+
"from azureml.core.script_run_config import ScriptRunConfig\n",
8083
"from azureml.core.run import Run\n",
84+
"from azureml.data.datapath import DataPath\n",
8185
"from azureml.train.automl import AutoMLConfig\n",
82-
"from sklearn.datasets import fetch_20newsgroups"
86+
"from sklearn.datasets import fetch_20newsgroups\n",
87+
"from sklearn.metrics import classification_report"
8388
]
8489
},
8590
{
@@ -123,7 +128,7 @@
123128
"ws = Workspace.from_config()\n",
124129
"\n",
125130
"# Choose an experiment name.\n",
126-
"experiment_name = \"automl-nlp-text-multiclass\"\n",
131+
"experiment_name = \"automl-nlp-text-classification-multiclass\"\n",
127132
"\n",
128133
"experiment = Experiment(ws, experiment_name)\n",
129134
"\n",
@@ -143,7 +148,7 @@
143148
"metadata": {},
144149
"source": [
145150
"## Set up a compute cluster\n",
146-
"This section uses a user-provided compute cluster (named \"dist-compute\" in this example). If a cluster with this name does not exist in the user's workspace, the below code will create a new cluster. You can choose the parameters of the cluster as mentioned in the comments."
151+
"This section uses a user-provided compute cluster (named \"gpu-compute\" in this example). If a cluster with this name does not exist in the user's workspace, the below code will create a new cluster. You can choose the parameters of the cluster as mentioned in the comments."
147152
]
148153
},
149154
{
@@ -156,13 +161,10 @@
156161
},
157162
"outputs": [],
158163
"source": [
159-
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
160-
"from azureml.core.compute_target import ComputeTargetException\n",
161-
"\n",
162164
"num_nodes = 1\n",
163165
"\n",
164166
"# Choose a name for your cluster.\n",
165-
"amlcompute_cluster_name = \"dist-compute\"\n",
167+
"amlcompute_cluster_name = \"gpu-compute\"\n",
166168
"\n",
167169
"# Verify that cluster does not exist already\n",
168170
"try:\n",
@@ -222,21 +224,19 @@
222224
" {feature_column_name: data.data, target_column_name: data.target}\n",
223225
" )\n",
224226
"\n",
225-
" data_train = data[:200]\n",
226-
" data_val = data[200:300]\n",
227-
" data_test = data[300:400]\n",
227+
" data_train = data.loc[:200]\n",
228+
" data_val = data.loc[200:300]\n",
229+
" data_test = data.loc[300:400]\n",
228230
"\n",
229-
" data_train = remove_blanks_20news(\n",
230-
" data_train, feature_column_name, target_column_name\n",
231-
" )\n",
232-
" data_val = remove_blanks_20news(data_val, feature_column_name, target_column_name)\n",
233-
" data_test = remove_blanks_20news(data_test, feature_column_name, target_column_name)\n",
231+
" data_train = remove_blanks_20news(data_train)\n",
232+
" data_val = remove_blanks_20news(data_val)\n",
233+
" data_test = remove_blanks_20news(data_test)\n",
234234
"\n",
235235
" return data_train, data_val, data_test\n",
236236
"\n",
237237
"\n",
238-
"def remove_blanks_20news(data, feature_column_name, target_column_name):\n",
239-
"\n",
238+
"def remove_blanks_20news(data):\n",
239+
" data = data.copy()\n",
240240
" data[feature_column_name] = (\n",
241241
" data[feature_column_name]\n",
242242
" .replace(r\"\\n\", \" \", regex=True)\n",
@@ -280,7 +280,12 @@
280280
"data_test.to_csv(test_data_fname, index=False)\n",
281281
"\n",
282282
"datastore = ws.get_default_datastore()\n",
283-
"datastore.upload(src_dir=data_dir, target_path=blobstore_datadir, overwrite=True)"
283+
"target = DataPath(\n",
284+
" datastore=datastore, path_on_datastore=blobstore_datadir, name=\"news_group_data\"\n",
285+
")\n",
286+
"Dataset.File.upload_directory(\n",
287+
" src_dir=data_dir, target=target, overwrite=True, show_progress=True\n",
288+
")"
284289
]
285290
},
286291
{
@@ -424,12 +429,7 @@
424429
"metadata": {},
425430
"outputs": [],
426431
"source": [
427-
"(\n",
428-
" best_run,\n",
429-
" best_model,\n",
430-
") = (\n",
431-
" automl_run.get_output()\n",
432-
") # You might see a warning about \"enable_distributed_dnn_training\". Please simply ignore.\n",
432+
"best_run, best_model = automl_run.get_output()\n",
433433
"best_run"
434434
]
435435
},
@@ -456,10 +456,7 @@
456456
"source": [
457457
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
458458
" path=[(datastore, blobstore_datadir + \"/test_data.csv\")]\n",
459-
")\n",
460-
"\n",
461-
"# preview the first 3 rows of the dataset\n",
462-
"test_dataset.take(3).to_pandas_dataframe()"
459+
")"
463460
]
464461
},
465462
{
@@ -490,8 +487,7 @@
490487
},
491488
"outputs": [],
492489
"source": [
493-
"# Load training script run corresponding to AutoML run above.\n",
494-
"training_run_id = automl_run.id + \"_HD_0\"\n",
490+
"training_run_id = best_run.id\n",
495491
"training_run = Run(experiment, training_run_id)"
496492
]
497493
},
@@ -526,9 +522,6 @@
526522
},
527523
"outputs": [],
528524
"source": [
529-
"import tempfile\n",
530-
"from azureml.core.script_run_config import ScriptRunConfig\n",
531-
"\n",
532525
"scoring_args = arguments\n",
533526
"with tempfile.TemporaryDirectory() as tmpdir:\n",
534527
" # Download required files from training run into temp folder.\n",
@@ -640,8 +633,6 @@
640633
},
641634
"outputs": [],
642635
"source": [
643-
"from sklearn.metrics import classification_report\n",
644-
"\n",
645636
"print(\n",
646637
" classification_report(\n",
647638
" test_data_df[target_column_name], test_set_predictions_df[target_column_name]\n",
@@ -678,7 +669,7 @@
678669
"name": "python3-azureml"
679670
},
680671
"kernelspec": {
681-
"display_name": "Python 3.7.0 64-bit ('pypi': conda)",
672+
"display_name": "Python 3.6",
682673
"language": "python",
683674
"name": "python3"
684675
},

‎python-sdk/tutorials/automl-with-azureml/automl-nlp-multiclass/update_env.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ dependencies:
44
- pandas~=1.1.5
55

66
- pip:
7-
- azureml-automl-dnn-nlp==1.39.0
8-
- horovod==0.21.3
7+
- azureml-automl-dnn-nlp==1.39.0

0 commit comments

Comments (0)