From ffe2e899395dae4c6a1f07eb0e24f1f346bad1ce Mon Sep 17 00:00:00 2001 From: Juan Uribe Date: Thu, 5 Aug 2021 11:34:09 -0700 Subject: [PATCH] Add demo with mock task for experimental method run_experiment_cloud. PiperOrigin-RevId: 388983797 --- ...tf_model_garden_on_gcp_with_tf_cloud.ipynb | 558 ++++++++++++++++++ 1 file changed, 558 insertions(+) create mode 100644 src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb diff --git a/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb b/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb new file mode 100644 index 00000000..042a2d78 --- /dev/null +++ b/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb @@ -0,0 +1,558 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Running model experiments from TF Model Garden on GCP with TF Cloud", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "cIG5d4Kvls6m" + }, + "source": [ + "##### Copyright 2021 The TensorFlow Cloud Authors.\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eR70XKMMmC8I", + "cellView": "form" + }, + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wKcTRRxsAmDl" + }, + "source": [ + "# Running model experiments from TF Model Garden on GCP with TF Cloud\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View on GitHub\n", + " \n", + " \"KaggleRun in Kaggle\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAUbwFuJB3bw" + }, + "source": [ + "In this example we will use [run_experiment_cloud](https://github.com/tensorflow/cloud/blob/690c3eee65dadee8af260a19341ff23f42f1f070/src/python/tensorflow_cloud/core/experimental/models.py#L230) from the experimental module of TF Cloud to run a mock experiment from [TF Model Garden](https://github.com/tensorflow/models/tree/master/official). We will also be showing the different distribution strategies that this method supports." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFCSAVDbC8-W" + }, + "source": [ + "## Install Packages\n", + "\n", + "We need tensorflow-cloud and the official release of tf-models-official." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iSiip7khfBvI" + }, + "source": [ + "!pip install -q tensorflow-cloud tf-models-official" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iZ-0PtcKhIqz" + }, + "source": [ + "## Import required modules" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "S31gRAUTfOTM", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d037ef96-a321-42e3-f981-3dd78b53316d" + }, + "source": [ + "import copy\n", + "import os\n", + "import sys\n", + "\n", + "import tensorflow_cloud as tfc\n", + "from tensorflow_cloud.core.experimental.models import run_experiment_cloud\n", + "\n", + "from official.core import task_factory\n", + "from official.utils.testing import mock_task\n", + "\n", + "print(tfc.__version__)" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.1.17.dev\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "siarj0TEhMzb" + }, + "source": [ + "## Project Configurations\n", + "Setting project parameters. For more details on Google Cloud Specific parameters please refer to [Google Cloud Project Setup Instructions](https://www.kaggle.com/nitric/google-cloud-project-setup-instructions/)." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YCeDOU9Ufny9" + }, + "source": [ + "# Set Google Cloud Specific parameters\n", + "\n", + "# TODO: Please set GCP_PROJECT_ID to your own Google Cloud project ID.\n", + "GCP_PROJECT_ID = 'YOUR_PROJECT_ID' #@param {type:\"string\"}\n", + "\n", + "# TODO: set GCS_BUCKET to your own Google Cloud Storage (GCS) bucket.\n", + "GCS_BUCKET = 'YOUR_BUCKET_NAME' #@param {type:\"string\"}\n", + "\n", + "# DO NOT CHANGE: Currently only the 'us-central1' region is supported.\n", + "REGION = 'us-central1'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mo6Wvg10DraI" + }, + "source": [ + "## Authenticating the notebook to use your Google Cloud Project\n", + "\n", + "This code authenticates the notebook, checking your valid Google Cloud credentials and identity. It is inside the `if not tfc.remote()` block to ensure that it is only run in the notebook, and will not be run when the notebook code is sent to Google Cloud.\n", + "\n", + "Note: For Kaggle Notebooks click on \"Add-ons\"->\"Google Cloud SDK\" before running the cell below." 
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "QeAmVS5KDtlR"
+        },
+        "source": [
+          "if not tfc.remote():\n",
+          "\n",
+          "    # Authentication for Kaggle Notebooks\n",
+          "    if \"kaggle_secrets\" in sys.modules:\n",
+          "        from kaggle_secrets import UserSecretsClient\n",
+          "        UserSecretsClient().set_gcloud_credentials(project=GCP_PROJECT_ID)\n",
+          "\n",
+          "    # Authentication for Colab Notebooks\n",
+          "    if \"google.colab\" in sys.modules:\n",
+          "        from google.colab import auth\n",
+          "        auth.authenticate_user()\n",
+          "        os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT_ID"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "E1Fy7xadhSRi"
+        },
+        "source": [
+          "## Set Up TF Model Garden Experiment\n",
+          "\n",
+          "We are going to set up the experiment from TF Model Garden that we want to run. In this case we run a mock experiment, but you can choose any experiment from TF Model Garden. We also override some of the params from the original experiment to include a trainer.\n",
+          "\n",
+          "Once the experiment config is ready, we store all of the params in a dictionary. The only param still missing is model_dir; we will set it later on, once per distribution strategy.\n",
+          "\n",
+          "For more details refer to [run_experiment on GitHub](https://github.com/tensorflow/models/blob/7c2ff1afc4423266223bcd50cba0ed55aca826c8/official/core/train_lib.py#L35).\n",
+          "\n",
+          "Note: run_experiment requires a distribution_strategy parameter. However, run_experiment_cloud selects the distribution strategy based on the cloud configuration, so you should not pass this parameter as part of run_experiment_kwargs."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "IKu5XD9MhbMm"
+        },
+        "source": [
+          "config = mock_task.mock_experiment()\n",
+          "\n",
+          "# Add a minimal trainer configuration on top of the mock experiment.\n",
+          "overrides = {\n",
+          "    \"trainer\": {\n",
+          "        \"checkpoint_interval\": 10,\n",
+          "        \"steps_per_loop\": 10,\n",
+          "        \"summary_interval\": 10,\n",
+          "        \"train_steps\": 10,\n",
+          "        \"validation_steps\": 5,\n",
+          "        \"validation_interval\": 10,\n",
+          "        \"continuous_eval_timeout\": 1,\n",
+          "        \"validation_summary_subdir\": \"validation\",\n",
+          "        \"optimizer_config\": {\n",
+          "            \"optimizer\": {\n",
+          "                \"type\": \"sgd\",\n",
+          "            },\n",
+          "            \"learning_rate\": {\n",
+          "                \"type\": \"constant\"\n",
+          "            }\n",
+          "        }\n",
+          "    },\n",
+          "}\n",
+          "config.override(overrides, is_strict=False)\n",
+          "\n",
+          "# Everything run_experiment needs except model_dir, which is set per strategy.\n",
+          "run_experiment_kwargs = dict(\n",
+          "    params=config,\n",
+          "    task=task_factory.get_task(config.task),\n",
+          "    mode=\"train_and_eval\",\n",
+          ")"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
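+      {
+        "cell_type": "markdown",
+        "metadata": {},
+        "source": [
+          "### Optional: using a real Model Garden experiment\n",
+          "\n",
+          "The mock experiment keeps this demo small and fast. If you want to try a real experiment instead, the sketch below builds the config from one of the registered Model Garden experiments. It assumes that the experiment name ('resnet_imagenet') and the registry import are available in your installed tf-models-official version, and that you replace the placeholder GCS data paths with your own dataset."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {},
+        "source": [
+          "# Sketch only: swap the mock experiment for a registered Model Garden experiment.\n",
+          "# The experiment name and data paths are placeholders; adjust them for your setup.\n",
+          "from official.common import registry_imports  # makes built-in experiments visible\n",
+          "from official.core import exp_factory\n",
+          "\n",
+          "real_config = exp_factory.get_exp_config(\"resnet_imagenet\")\n",
+          "real_config.override({\n",
+          "    \"task\": {\n",
+          "        \"train_data\": {\"input_path\": f\"gs://{GCS_BUCKET}/data/train*\"},\n",
+          "        \"validation_data\": {\"input_path\": f\"gs://{GCS_BUCKET}/data/valid*\"},\n",
+          "    },\n",
+          "}, is_strict=False)\n",
+          "\n",
+          "# To use it, pass this dict instead of run_experiment_kwargs in the cells below.\n",
+          "real_run_experiment_kwargs = dict(\n",
+          "    params=real_config,\n",
+          "    task=task_factory.get_task(real_config.task),\n",
+          "    mode=\"train_and_eval\",\n",
+          ")"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },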
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "E94YK3sGikkx"
+        },
+        "source": [
+          "## Set up the TensorFlow Cloud run\n",
+          "\n",
+          "Set up the parameters for tfc.run(). The chief_config, worker_count and worker_config will be set individually for each distribution strategy. For more details refer to the [TensorFlow Cloud overview tutorial](https://colab.research.google.com/github/tensorflow/cloud/blob/master/g3doc/tutorials/overview.ipynb)."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "C8VWQ3AANj3V"
+        },
+        "source": [
+          "# The remote workers need tf-models-official as well.\n",
+          "with open('requirements.txt','w') as f:\n",
+          "    f.write('tf-models-official\\n')\n",
+          "\n",
+          "run_kwargs = dict(\n",
+          "    requirements_txt='requirements.txt',\n",
+          "    docker_config=tfc.DockerConfig(\n",
+          "        parent_image=\"gcr.io/deeplearning-platform-release/tf2-gpu.2-5\",\n",
+          "        image_build_bucket=GCS_BUCKET\n",
+          "    ),\n",
+          ")"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "ZgDk8hO4gwdp"
+        },
+        "source": [
+          "## Set up distribution strategies\n",
+          "\n",
+          "Currently run_experiment_cloud supports 4 different distribution strategies:\n",
+          "\n",
+          "1. One Device\n",
+          "2. Mirror\n",
+          "3. Multi Worker Mirror\n",
+          "4. TPU\n",
+          "\n",
+          "Unlike run_experiment from TF Model Garden, you do not specify the distribution strategy yourself. Instead, it is selected based on the machine configuration provided in run_kwargs. The following sections show how to set up the machine config for each strategy.\n",
+          "\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "kQ-NifU0g1mk"
+        },
+        "source": [
+          "### One device strategy\n",
+          "\n",
+          "Uses the default values for the machine config: one accelerator (T4_1X) and 0 workers."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "nUF7eBIxg5Qe"
+        },
+        "source": [
+          "JOB_NAME = 'one_device' #@param {type:\"string\"}\n",
+          "\n",
+          "# Setting location where training logs and checkpoints will be stored\n",
+          "GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'\n",
+          "one_device_model_dir = os.path.join(GCS_BASE_PATH,\"saved_model\")\n",
+          "\n",
+          "one_device_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)\n",
+          "one_device_run_experiment_kwargs.update(dict(\n",
+          "    model_dir=one_device_model_dir,\n",
+          "))\n",
+          "\n",
+          "one_device_run_kwargs = copy.deepcopy(run_kwargs)\n",
+          "one_device_run_kwargs.update(dict(\n",
+          "    job_labels={'job': JOB_NAME}\n",
+          "))"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "ekFfRN66nRI4"
+        },
+        "source": [
+          "### Mirror strategy\n",
+          "\n",
+          "Requires at least two accelerators in the chief_config and 0 workers."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "CuqZ6ro7nYzb"
+        },
+        "source": [
+          "JOB_NAME = 'mirror' #@param {type:\"string\"}\n",
+          "\n",
+          "# Setting location where training logs and checkpoints will be stored\n",
+          "GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'\n",
+          "mirror_model_dir = os.path.join(GCS_BASE_PATH,\"saved_model\")\n",
+          "\n",
+          "mirror_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)\n",
+          "mirror_run_experiment_kwargs.update(dict(\n",
+          "    model_dir=mirror_model_dir,\n",
+          "))\n",
+          "\n",
+          "mirror_run_kwargs = copy.deepcopy(run_kwargs)\n",
+          "mirror_run_kwargs.update(dict(\n",
+          "    chief_config=tfc.COMMON_MACHINE_CONFIGS[\"P100_4X\"],\n",
+          "    job_labels={'job': JOB_NAME}\n",
+          "))"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "svxrpHIYoh6W"
+        },
+        "source": [
+          "### Multi worker mirror strategy\n",
+          "\n",
+          "Requires at least one worker."
+        ]
+      },
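+      {
+        "cell_type": "markdown",
+        "metadata": {},
+        "source": [
+          "The chief_config and worker_config values in these sections are presets from tfc.COMMON_MACHINE_CONFIGS. The cell below is an optional sketch that lists the preset names available in your installed version of tensorflow-cloud, in case you want to pick a different machine type."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {},
+        "source": [
+          "# Optional: list the predefined machine configurations shipped with\n",
+          "# tensorflow-cloud (e.g. 'CPU', 'T4_1X', 'P100_4X', 'TPU').\n",
+          "print(sorted(tfc.COMMON_MACHINE_CONFIGS))"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },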
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "V-F-cgwbomNj"
+        },
+        "source": [
+          "JOB_NAME = 'multi_mirror' #@param {type:\"string\"}\n",
+          "\n",
+          "# Setting location where training logs and checkpoints will be stored\n",
+          "GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'\n",
+          "multi_mirror_model_dir = os.path.join(GCS_BASE_PATH,\"saved_model\")\n",
+          "\n",
+          "multi_mirror_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)\n",
+          "multi_mirror_run_experiment_kwargs.update(dict(\n",
+          "    model_dir=multi_mirror_model_dir,\n",
+          "))\n",
+          "\n",
+          "multi_mirror_run_kwargs = copy.deepcopy(run_kwargs)\n",
+          "multi_mirror_run_kwargs.update(dict(\n",
+          "    chief_config=tfc.COMMON_MACHINE_CONFIGS[\"P100_1X\"],\n",
+          "    worker_count=1,\n",
+          "    worker_config=tfc.COMMON_MACHINE_CONFIGS[\"P100_1X\"],\n",
+          "    job_labels={'job': JOB_NAME}\n",
+          "))"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "toq28wZApFIM"
+        },
+        "source": [
+          "### TPU strategy\n",
+          "\n",
+          "Requires a TPU as the worker_config."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "bQ7Q_RZ9pIKV"
+        },
+        "source": [
+          "JOB_NAME = 'tpu' #@param {type:\"string\"}\n",
+          "\n",
+          "# Setting location where training logs and checkpoints will be stored\n",
+          "GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'\n",
+          "tpu_model_dir = os.path.join(GCS_BASE_PATH,\"saved_model\")\n",
+          "\n",
+          "tpu_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)\n",
+          "tpu_run_experiment_kwargs.update(dict(\n",
+          "    model_dir=tpu_model_dir,\n",
+          "))\n",
+          "\n",
+          "tpu_run_kwargs = copy.deepcopy(run_kwargs)\n",
+          "tpu_run_kwargs.update(dict(\n",
+          "    chief_config=tfc.COMMON_MACHINE_CONFIGS[\"CPU\"],\n",
+          "    worker_count=1,\n",
+          "    worker_config=tfc.COMMON_MACHINE_CONFIGS[\"TPU\"],\n",
+          "    job_labels={'job': JOB_NAME}\n",
+          "))"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "tWhfuseFPnBa"
+        },
+        "source": [
+          "## Run remote experiment\n",
+          "\n",
+          "Select the distribution strategy to use and then run the remote experiment by calling run_experiment_cloud with the corresponding configs."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "h8K3Q3-5PuoY"
+        },
+        "source": [
+          "run_experiment_configs = dict(\n",
+          "    one_device=one_device_run_experiment_kwargs,\n",
+          "    mirror=mirror_run_experiment_kwargs,\n",
+          "    multi_mirror=multi_mirror_run_experiment_kwargs,\n",
+          "    tpu=tpu_run_experiment_kwargs,\n",
+          ")\n",
+          "\n",
+          "run_configs = dict(\n",
+          "    one_device=one_device_run_kwargs,\n",
+          "    mirror=mirror_run_kwargs,\n",
+          "    multi_mirror=multi_mirror_run_kwargs,\n",
+          "    tpu=tpu_run_kwargs,\n",
+          ")\n",
+          "\n",
+          "distribution_strategy = 'one_device' #@param [\"one_device\", \"mirror\", \"multi_mirror\", \"tpu\"]"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "jRuqBEKdREeB"
+        },
+        "source": [
+          "run_experiment_cloud(run_experiment_configs[distribution_strategy],\n",
+          "                     run_configs[distribution_strategy])"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },
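+      {
+        "cell_type": "markdown",
+        "metadata": {},
+        "source": [
+          "## Monitor training with TensorBoard (optional)\n",
+          "\n",
+          "run_experiment writes TensorBoard summaries under model_dir, so while the remote job is running (or after it finishes) you can point TensorBoard at the corresponding GCS path to follow progress. The sketch below assumes TensorBoard is installed in the notebook environment (it ships with TensorFlow) and that this notebook is authenticated to read from your GCS bucket."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {},
+        "source": [
+          "# Optional: follow the remote job with TensorBoard, reading the summaries\n",
+          "# that run_experiment writes under the selected model_dir in GCS.\n",
+          "%load_ext tensorboard\n",
+          "\n",
+          "tensorboard_logdir = run_experiment_configs[distribution_strategy]['model_dir']\n",
+          "%tensorboard --logdir $tensorboard_logdir"
+        ],
+        "execution_count": null,
+        "outputs": []
+      },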
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "ZLFHnIFUF8Fx"
+        },
+        "source": [
+          "## Training Results\n",
+          "### Reconnect your Colab instance\n",
+          "Most remote training jobs are long-running. If you are using Colab, it may time out before the training results are available. In that case, rerun the following sections in order to reconnect and configure your Colab instance to access the training results:\n",
+          "\n",
+          "1. Import required modules\n",
+          "2. Project Configurations\n",
+          "3. Authenticating the notebook to use your Google Cloud Project"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "metadata": {
+          "id": "tNBMCk0MFi1A"
+        },
+        "source": [
+          "### Load your trained model\n",
+          "\n",
+          "Once training is complete, you can retrieve your model from the GCS bucket you specified above."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "metadata": {
+          "id": "B7vpXm3l9iiT"
+        },
+        "source": [
+          "import tensorflow as tf\n",
+          "\n",
+          "saved_model_dir = run_experiment_configs[distribution_strategy]['model_dir']\n",
+          "\n",
+          "trained_model = tf.keras.models.load_model(saved_model_dir)\n",
+          "trained_model.summary()"
+        ],
+        "execution_count": null,
+        "outputs": []
+      }
+    ]
+}
\ No newline at end of file