Commit 9a1180a

juanuribe28 authored and Tensorflow Cloud maintainers committed

Add integration tests for run_experiment_cloud wrapper.

PiperOrigin-RevId: 383893019
1 parent f1ae448, commit 9a1180a

File tree

8 files changed: +428 −123 lines changed

src/python/dependencies.py

Lines changed: 3 additions & 0 deletions

@@ -27,6 +27,8 @@ def make_required_install_packages():
         "tensorflow>=1.15.0,<3.0",
         "tensorflow_datasets",
         "tensorflow_transform",
+        "tf-models-official",
+        "importlib_resources ; python_version<'3.7'"
     ]


@@ -38,4 +40,5 @@ def make_required_test_packages():
         "numpy",
         "nbconvert",
         "tf-models-official",
+        "importlib_resources ; python_version<'3.7'"
     ]
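
The `importlib_resources ; python_version<'3.7'` entries use a pip environment marker, so the backport is only installed on interpreters older than Python 3.7. A minimal sketch of the compatibility import this dependency supports (the same pattern models.py adopts below):

# Minimal sketch: prefer the standard library, fall back to the backport
# that dependencies.py now installs for Python < 3.7.
try:
    import importlib.resources as pkg_resources  # Python >= 3.7
except ImportError:
    import importlib_resources as pkg_resources  # backport for Python < 3.7

# Both modules expose the same API, e.g. pkg_resources.path(package, resource).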

src/python/tensorflow_cloud/core/containerize.py

Lines changed: 1 addition & 1 deletion

@@ -285,7 +285,7 @@ def _get_file_path_map(self):
             self.entry_point = sys.argv[0]

         # Map entry_point directory to the dst directory.
-        if not self.called_from_notebook:
+        if not self.called_from_notebook or self.entry_point is not None:
             entry_point_dir, _ = os.path.split(self.entry_point)
             if not entry_point_dir:  # Current directory
                 entry_point_dir = "."

src/python/tensorflow_cloud/core/experimental/models.py

Lines changed: 48 additions & 44 deletions

@@ -15,17 +15,27 @@
 """Module that contains the `run_models` wrapper for training models from TF Model Garden."""

 import os
+import pickle
+import shutil
 from typing import Any, Dict, Optional
+import uuid

 from .. import machine_config
 from .. import run
 import tensorflow as tf
 import tensorflow_datasets as tfds

-from official.core import train_lib
 from official.vision.image_classification.efficientnet import efficientnet_model
 from official.vision.image_classification.resnet import resnet_model

+# pylint: disable=g-import-not-at-top
+try:
+    import importlib.resources as pkg_resources
+except ImportError:
+    # Backported for python<3.7
+    import importlib_resources as pkg_resources
+# pylint: enable=g-import-not-at-top
+

 def run_models(dataset_name: str,
                model_name: str,
@@ -251,48 +261,42 @@ def run_experiment_cloud(run_experiment_kwargs: Dict[str, Any],
     """
     if run_kwargs is None:
         run_kwargs = dict()
-
-    if run.remote():
-        default_machine_config = machine_config.COMMON_MACHINE_CONFIGS['T4_1X']
-        if 'chief_config' in run_kwargs:
-            chief_config = run_kwargs['chief_config']
-        else:
-            chief_config = default_machine_config
-        if 'worker_count' in run_kwargs:
-            worker_count = run_kwargs['worker_count']
+    distribution_strategy = get_distribution_strategy_str(run_kwargs)
+    run_experiment_kwargs.update(
+        dict(distribution_strategy=distribution_strategy))
+    file_id = str(uuid.uuid4())
+    params_file = save_params(run_experiment_kwargs, file_id)
+
+    with pkg_resources.path(__package__, 'models_entry_point.py') as path:
+        entry_point = f'{file_id}.py'
+        shutil.copyfile(str(path), entry_point)
+    run_kwargs.update(dict(entry_point=entry_point,
+                           distribution_strategy=None))
+    info = run.run(**run_kwargs)
+    os.remove(entry_point)
+    os.remove(params_file)
+    return info
+
+
+def get_distribution_strategy_str(run_kwargs):
+    """Gets the name of a distribution strategy based on cloud run config."""
+    if ('worker_count' in run_kwargs
+            and run_kwargs['worker_count'] > 0):
+        if ('worker_config' in run_kwargs
+                and machine_config.is_tpu_config(run_kwargs['worker_config'])):
+            return 'tpu'
         else:
-            worker_count = 0
-        if 'worker_config' in run_kwargs:
-            worker_config = run_kwargs['worker_config']
-        else:
-            worker_config = default_machine_config
-        distribution_strategy = get_distribution_strategy(chief_config,
-                                                          worker_count,
-                                                          worker_config)
-        run_experiment_kwargs.update(
-            dict(distribution_strategy=distribution_strategy))
-        model, _ = train_lib.run_experiment(**run_experiment_kwargs)
-        model.save(run_experiment_kwargs['model_dir'])
-
-    run_kwargs.update(dict(entry_point=None,
-                           distribution_strategy=None))
-    return run.run(**run_kwargs)
-
-
-def get_distribution_strategy(chief_config, worker_count, worker_config):
-    """Gets a tf distribution strategy based on the cloud run config."""
-    if worker_count > 0:
-        if machine_config.is_tpu_config(worker_config):
-            # TODO(b/194857231) Dependency conflict for using TPUs
-            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-                tpu='local')
-            tf.config.experimental_connect_to_cluster(resolver)
-            tf.tpu.experimental.initialize_tpu_system(resolver)
-            return tf.distribute.TPUStrategy(resolver)
-        else:
-            # TODO(b/148619319) Saving model currently failing
-            return tf.distribute.MultiWorkerMirroredStrategy()
-    elif chief_config.accelerator_count > 1:
-        return tf.distribute.MirroredStrategy()
+            return 'multi_mirror'
+    elif ('chief_config' in run_kwargs
+          and run_kwargs['chief_config'].accelerator_count > 1):
+        return 'mirror'
     else:
-        return tf.distribute.OneDeviceStrategy(device='/gpu:0')
+        return 'one_device'
+
+
+def save_params(params, file_id):
+    """Pickles the params object using the file_id as prefix."""
+    file_name = f'{file_id}_params'
+    with open(file_name, 'xb') as f:
+        pickle.dump(params, f)
+    return file_name
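
run_experiment_cloud now pickles run_experiment_kwargs, copies models_entry_point.py to a uuid-named script, submits it through run.run, and removes both temporary files afterwards. A hedged usage sketch assembled from the integration test added below; the bucket name and machine configs are placeholders:

# Usage sketch only; argument values mirror the integration test in this
# commit, with 'my-test-bucket' as a placeholder GCS bucket.
import tensorflow_cloud as tfc
from tensorflow_cloud.core.experimental import models
from official.core import task_factory
from official.utils.testing import mock_task

params = mock_task.mock_experiment()
run_experiment_kwargs = dict(
    params=params,
    task=task_factory.get_task(params.task),
    mode='train_and_eval',
    model_dir='gs://my-test-bucket/saved_model',  # placeholder bucket path
)
run_kwargs = dict(
    chief_config=tfc.COMMON_MACHINE_CONFIGS['P100_4X'],  # maps to the 'mirror' strategy
    docker_config=tfc.DockerConfig(image_build_bucket='my-test-bucket'),
)
info = models.run_experiment_cloud(run_experiment_kwargs, run_kwargs)
print(info['job_id'])  # run.run() returns job details, including the AIP job id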

src/python/tensorflow_cloud/core/experimental/models_entry_point.py

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+# Lint as: python3
+# Copyright 2021 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Entry point file for run_experiment_cloud."""
+
+import os
+import pickle
+
+import tensorflow as tf
+
+from official.core import train_lib
+
+
+def load_params(file_name):
+    with open(file_name, 'rb') as f:
+        params = pickle.load(f)
+    return params
+
+
+def get_tpu_strategy():
+    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
+        tpu='local')
+    tf.config.experimental_connect_to_cluster(resolver)
+    tf.tpu.experimental.initialize_tpu_system(resolver)
+    return tf.distribute.TPUStrategy(resolver)
+
+
+def get_one_device():
+    return tf.distribute.OneDeviceStrategy(device='/gpu:0')
+
+_DISTRIBUTION_STRATEGIES = dict(
+    # TODO(b/194857231) Dependency conflict for using TPUs
+    tpu=get_tpu_strategy,
+    # TODO(b/148619319) Saving model currently failing for multi_mirror
+    multi_mirror=tf.distribute.MultiWorkerMirroredStrategy,
+    mirror=tf.distribute.MirroredStrategy,
+    one_device=get_one_device)
+
+
+def main():
+    prefix, _ = os.path.splitext(os.path.basename(__file__))
+    run_experiment_kwargs = load_params(f'{prefix}_params')
+    strategy_str = run_experiment_kwargs['distribution_strategy']
+    strategy = _DISTRIBUTION_STRATEGIES[strategy_str]()
+    run_experiment_kwargs.update(dict(
+        distribution_strategy=strategy))
+    model, _ = train_lib.run_experiment(**run_experiment_kwargs)
+    model.save(run_experiment_kwargs['model_dir'])
+
+
+if __name__ == '__main__':
+    main()
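
The wrapper and this entry point agree on file names purely by convention: save_params writes a file named after the uuid, the entry point is copied to a script with the same uuid prefix, and main() recovers that prefix from its own filename. A minimal sketch of the round trip; 'example-uuid' is an illustrative stand-in for the generated uuid4 string:

# Minimal sketch of the params round trip used by run_experiment_cloud
# and models_entry_point.py.
import os
import pickle

file_id = 'example-uuid'
with open(f'{file_id}_params', 'xb') as f:           # what save_params() writes
    pickle.dump({'distribution_strategy': 'one_device'}, f)

entry_point = f'{file_id}.py'                        # name of the copied entry point
prefix, _ = os.path.splitext(os.path.basename(entry_point))
with open(f'{prefix}_params', 'rb') as f:            # what load_params() reads back
    params = pickle.load(f)
assert params['distribution_strategy'] == 'one_device'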

Lines changed: 167 additions & 0 deletions

@@ -0,0 +1,167 @@
+# Lint as: python3
+# Copyright 2021 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Integration tests for calling run_experiment_cloud."""
+
+import os
+import uuid
+
+import tensorflow as tf
+import tensorflow_cloud as tfc
+from tensorflow_cloud.core.experimental import models
+from tensorflow_cloud.utils import google_api_client
+from official.core import task_factory
+from official.utils.testing import mock_task
+
+# The staging bucket to use for cloudbuild as well as save the model and data.
+_TEST_BUCKET = os.environ["TEST_BUCKET"]
+_PROJECT_ID = os.environ["PROJECT_ID"]
+_PARENT_IMAGE = "gcr.io/deeplearning-platform-release/tf2-gpu.2-5"
+_BASE_PATH = f"gs://{_TEST_BUCKET}/{uuid.uuid4()}"
+
+
+class RunExperimentCloudTest(tf.test.TestCase):
+
+    def setUp(self):
+        super(RunExperimentCloudTest, self).setUp()
+        self.test_data_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "../testdata/"
+        )
+        self.requirements_txt = os.path.join(self.test_data_path,
+                                             "requirements.txt")
+
+        self._test_config = {
+            "trainer": {
+                "checkpoint_interval": 10,
+                "steps_per_loop": 10,
+                "summary_interval": 10,
+                "train_steps": 10,
+                "validation_steps": 5,
+                "validation_interval": 10,
+                "continuous_eval_timeout": 1,
+                "validation_summary_subdir": "validation",
+                "optimizer_config": {
+                    "optimizer": {
+                        "type": "sgd",
+                    },
+                    "learning_rate": {
+                        "type": "constant"
+                    }
+                }
+            },
+        }
+
+        self.params = mock_task.mock_experiment()
+        self.params.override(self._test_config, is_strict=False)
+        self.run_experiment_kwargs = dict(
+            params=self.params,
+            task=task_factory.get_task(self.params.task),
+            mode="train_and_eval",
+        )
+        self.docker_config = tfc.DockerConfig(
+            parent_image=_PARENT_IMAGE,
+            image_build_bucket=_TEST_BUCKET
+        )
+
+    def tpu_strategy(self):
+        run_kwargs = dict(
+            chief_config=tfc.COMMON_MACHINE_CONFIGS["CPU"],
+            worker_count=1,
+            worker_config=tfc.COMMON_MACHINE_CONFIGS["TPU"],
+            requirements_txt=self.requirements_txt,
+            job_labels={
+                "job": "tpu_strategy",
+                "team": "run_experiment_cloud_tests",
+            },
+            docker_config=self.docker_config,
+        )
+        run_experiment_kwargs = dict(
+            model_dir=os.path.join(_BASE_PATH, "tpu", "saved_model"),
+            **self.run_experiment_kwargs,
+        )
+        return models.run_experiment_cloud(run_experiment_kwargs,
+                                           run_kwargs)
+
+    def multi_mirror_strategy(self):
+        run_kwargs = dict(
+            chief_config=tfc.COMMON_MACHINE_CONFIGS["P100_1X"],
+            worker_count=1,
+            worker_config=tfc.COMMON_MACHINE_CONFIGS["P100_1X"],
+            requirements_txt=self.requirements_txt,
+            job_labels={
+                "job": "multi_mirror_strategy",
+                "team": "run_experiment_cloud_tests",
+            },
+            docker_config=self.docker_config,
+        )
+        run_experiment_kwargs = dict(
+            model_dir=os.path.join(_BASE_PATH, "multi_mirror", "saved_model"),
+            **self.run_experiment_kwargs,
+        )
+        return models.run_experiment_cloud(run_experiment_kwargs,
+                                           run_kwargs)
+
+    def mirror_strategy(self):
+        run_kwargs = dict(
+            chief_config=tfc.COMMON_MACHINE_CONFIGS["P100_4X"],
+            requirements_txt=self.requirements_txt,
+            job_labels={
+                "job": "mirror",
+                "team": "run_experiment_cloud_tests",
+            },
+            docker_config=self.docker_config,
+        )
+        run_experiment_kwargs = dict(
+            model_dir=os.path.join(_BASE_PATH, "mirror", "saved_model"),
+            **self.run_experiment_kwargs,
+        )
+        return models.run_experiment_cloud(run_experiment_kwargs,
+                                           run_kwargs)
+
+    def one_device_strategy(self):
+        run_kwargs = dict(
+            requirements_txt=self.requirements_txt,
+            job_labels={
+                "job": "one_device",
+                "team": "run_experiment_cloud_tests",
+            },
+            docker_config=self.docker_config,
+        )
+        run_experiment_kwargs = dict(
+            model_dir=os.path.join(_BASE_PATH, "one_device", "saved_model"),
+            **self.run_experiment_kwargs,
+        )
+        # Using the default T4 GPU for this test.
+        return models.run_experiment_cloud(run_experiment_kwargs,
+                                           run_kwargs)
+
+    def test_run_experiment_cloud(self):
+        track_status = {
+            "one_device_strategy": self.one_device_strategy(),
+            "mirror_strategy": self.mirror_strategy(),
+            # TODO(b/148619319) Enable when bug is solved
+            # "multi_mirror_strategy": self.multi_mirror_strategy(),
+            # TODO(b/194857231) Enable when bug is solved
+            # "tpu_strategy": self.tpu_strategy(),
+        }
+
+        for test_name, ret_val in track_status.items():
+            self.assertTrue(
+                google_api_client.wait_for_aip_training_job_completion(
+                    ret_val["job_id"], _PROJECT_ID),
+                "Job {} generated from the test: {} has failed".format(
+                    ret_val["job_id"], test_name))
+
+if __name__ == "__main__":
+    tf.test.main()
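
The test reads its GCP configuration from environment variables at import time. A sketch of the expected environment; the bucket and project names below are placeholders:

# Sketch only: the integration test expects these variables before import.
# 'my-test-bucket' and 'my-gcp-project' are placeholder values.
import os

os.environ['TEST_BUCKET'] = 'my-test-bucket'   # GCS bucket for cloudbuild staging, model and data
os.environ['PROJECT_ID'] = 'my-gcp-project'    # GCP project that runs the AIP training jobs
# With the environment set, the test file can be executed directly,
# since it ends with tf.test.main().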

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+git+https://github.com/tensorflow/cloud.git@refs/pull/360/head#egg=tensorflow-cloud&subdirectory=src/python
+tf-models-official
