|
13 | 13 | "cell_type": "markdown",
|
14 | 14 | "metadata": {},
|
15 | 15 | "source": [
|
16 |
| - "" |
| 16 | + "" |
17 | 17 | ]
|
18 | 18 | },
|
19 | 19 | {
|
|
68 | 68 | "source": [
|
69 | 69 | "import logging\n",
|
70 | 70 | "import os\n",
|
| 71 | + "import tempfile\n", |
71 | 72 | "\n",
|
72 | 73 | "import pandas as pd\n",
|
73 | 74 | "\n",
|
|
77 | 78 | "from azureml.core.dataset import Dataset\n",
|
78 | 79 | "from azureml.core.compute import AmlCompute\n",
|
79 | 80 | "from azureml.core.compute import ComputeTarget\n",
|
| 81 | + "from azureml.core.compute_target import ComputeTargetException\n", |
| 82 | + "from azureml.core.script_run_config import ScriptRunConfig\n", |
80 | 83 | "from azureml.core.run import Run\n",
|
| 84 | + "from azureml.data.datapath import DataPath\n", |
81 | 85 | "from azureml.train.automl import AutoMLConfig\n",
|
82 |
| - "from sklearn.datasets import fetch_20newsgroups" |
| 86 | + "from sklearn.datasets import fetch_20newsgroups\n", |
| 87 | + "from sklearn.metrics import classification_report" |
83 | 88 | ]
|
84 | 89 | },
|
85 | 90 | {
|
|
123 | 128 | "ws = Workspace.from_config()\n",
|
124 | 129 | "\n",
|
125 | 130 | "# Choose an experiment name.\n",
|
126 |
| - "experiment_name = \"automl-nlp-text-multiclass\"\n", |
| 131 | + "experiment_name = \"automl-nlp-text-classification-multiclass\"\n", |
127 | 132 | "\n",
|
128 | 133 | "experiment = Experiment(ws, experiment_name)\n",
|
129 | 134 | "\n",
|
|
143 | 148 | "metadata": {},
|
144 | 149 | "source": [
|
145 | 150 | "## Set up a compute cluster\n",
|
146 |
| - "This section uses a user-provided compute cluster (named \"dist-compute\" in this example). If a cluster with this name does not exist in the user's workspace, the below code will create a new cluster. You can choose the parameters of the cluster as mentioned in the comments." |
| 151 | + "This section uses a user-provided compute cluster (named \"gpu-compute\" in this example). If a cluster with this name does not exist in the user's workspace, the below code will create a new cluster. You can choose the parameters of the cluster as mentioned in the comments." |
147 | 152 | ]
|
148 | 153 | },
|
149 | 154 | {
|
|
156 | 161 | },
|
157 | 162 | "outputs": [],
|
158 | 163 | "source": [
|
159 |
| - "from azureml.core.compute import ComputeTarget, AmlCompute\n", |
160 |
| - "from azureml.core.compute_target import ComputeTargetException\n", |
161 |
| - "\n", |
162 | 164 | "num_nodes = 1\n",
|
163 | 165 | "\n",
|
164 | 166 | "# Choose a name for your cluster.\n",
|
165 |
| - "amlcompute_cluster_name = \"dist-compute\"\n", |
| 167 | + "amlcompute_cluster_name = \"gpu-compute\"\n", |
166 | 168 | "\n",
|
167 | 169 | "# Verify that cluster does not exist already\n",
|
168 | 170 | "try:\n",
|
|
222 | 224 | " {feature_column_name: data.data, target_column_name: data.target}\n",
|
223 | 225 | " )\n",
|
224 | 226 | "\n",
|
225 |
| - " data_train = data[:200]\n", |
226 |
| - " data_val = data[200:300]\n", |
227 |
| - " data_test = data[300:400]\n", |
| 227 | + "    data_train = data.iloc[:200]\n", |
| 228 | + "    data_val = data.iloc[200:300]\n", |
| 229 | + "    data_test = data.iloc[300:400]\n", |
228 | 230 | "\n",
|
229 |
| - " data_train = remove_blanks_20news(\n", |
230 |
| - " data_train, feature_column_name, target_column_name\n", |
231 |
| - " )\n", |
232 |
| - " data_val = remove_blanks_20news(data_val, feature_column_name, target_column_name)\n", |
233 |
| - " data_test = remove_blanks_20news(data_test, feature_column_name, target_column_name)\n", |
| 231 | + " data_train = remove_blanks_20news(data_train)\n", |
| 232 | + " data_val = remove_blanks_20news(data_val)\n", |
| 233 | + " data_test = remove_blanks_20news(data_test)\n", |
234 | 234 | "\n",
|
235 | 235 | " return data_train, data_val, data_test\n",
|
236 | 236 | "\n",
|
237 | 237 | "\n",
|
238 |
| - "def remove_blanks_20news(data, feature_column_name, target_column_name):\n", |
239 |
| - "\n", |
| 238 | + "def remove_blanks_20news(data):\n", |
| 239 | + " data = data.copy()\n", |
240 | 240 | " data[feature_column_name] = (\n",
|
241 | 241 | " data[feature_column_name]\n",
|
242 | 242 | " .replace(r\"\\n\", \" \", regex=True)\n",
|
|
280 | 280 | "data_test.to_csv(test_data_fname, index=False)\n",
|
281 | 281 | "\n",
|
282 | 282 | "datastore = ws.get_default_datastore()\n",
|
283 |
| - "datastore.upload(src_dir=data_dir, target_path=blobstore_datadir, overwrite=True)" |
| 283 | + "target = DataPath(\n", |
| 284 | + " datastore=datastore, path_on_datastore=blobstore_datadir, name=\"news_group_data\"\n", |
| 285 | + ")\n", |
| 286 | + "Dataset.File.upload_directory(\n", |
| 287 | + " src_dir=data_dir, target=target, overwrite=True, show_progress=True\n", |
| 288 | + ")" |
284 | 289 | ]
|
285 | 290 | },
|
286 | 291 | {
|
|
424 | 429 | "metadata": {},
|
425 | 430 | "outputs": [],
|
426 | 431 | "source": [
|
427 |
| - "(\n", |
428 |
| - " best_run,\n", |
429 |
| - " best_model,\n", |
430 |
| - ") = (\n", |
431 |
| - " automl_run.get_output()\n", |
432 |
| - ") # You might see a warning about \"enable_distributed_dnn_training\". Please simply ignore.\n", |
| 432 | + "best_run, best_model = automl_run.get_output()\n", |
433 | 433 | "best_run"
|
434 | 434 | ]
|
435 | 435 | },
|
|
456 | 456 | "source": [
|
457 | 457 | "test_dataset = Dataset.Tabular.from_delimited_files(\n",
|
458 | 458 | " path=[(datastore, blobstore_datadir + \"/test_data.csv\")]\n",
|
459 |
| - ")\n", |
460 |
| - "\n", |
461 |
| - "# preview the first 3 rows of the dataset\n", |
462 |
| - "test_dataset.take(3).to_pandas_dataframe()" |
| 459 | + ")" |
463 | 460 | ]
|
464 | 461 | },
|
465 | 462 | {
|
|
490 | 487 | },
|
491 | 488 | "outputs": [],
|
492 | 489 | "source": [
|
493 |
| - "# Load training script run corresponding to AutoML run above.\n", |
494 |
| - "training_run_id = automl_run.id + \"_HD_0\"\n", |
| 490 | + "training_run_id = best_run.id\n", |
495 | 491 | "training_run = Run(experiment, training_run_id)"
|
496 | 492 | ]
|
497 | 493 | },
|
|
526 | 522 | },
|
527 | 523 | "outputs": [],
|
528 | 524 | "source": [
|
529 |
| - "import tempfile\n", |
530 |
| - "from azureml.core.script_run_config import ScriptRunConfig\n", |
531 |
| - "\n", |
532 | 525 | "scoring_args = arguments\n",
|
533 | 526 | "with tempfile.TemporaryDirectory() as tmpdir:\n",
|
534 | 527 | " # Download required files from training run into temp folder.\n",
|
|
640 | 633 | },
|
641 | 634 | "outputs": [],
|
642 | 635 | "source": [
|
643 |
| - "from sklearn.metrics import classification_report\n", |
644 |
| - "\n", |
645 | 636 | "print(\n",
|
646 | 637 | " classification_report(\n",
|
647 | 638 | " test_data_df[target_column_name], test_set_predictions_df[target_column_name]\n",
|
|
678 | 669 | "name": "python3-azureml"
|
679 | 670 | },
|
680 | 671 | "kernelspec": {
|
681 |
| - "display_name": "Python 3.7.0 64-bit ('pypi': conda)", |
| 672 | + "display_name": "Python 3.6", |
682 | 673 | "language": "python",
|
683 | 674 | "name": "python3"
|
684 | 675 | },
|
|
0 commit comments