Merge pull request #3878 from GeorgianaElena/no-more-hub-type
Allow defaulting to a basehub infrastructure setup during phase 2 of hub deploy
GeorgianaElena authored Apr 9, 2024
2 parents d8018e1 + 5e22955 commit 4114bf5
Showing 11 changed files with 694 additions and 559 deletions.
10 changes: 7 additions & 3 deletions deployer/commands/generate/dedicated_cluster/aws.py
@@ -12,6 +12,7 @@

 import jinja2
 import typer
+from typing_extensions import Annotated
 
 from deployer.utils.file_acquisition import REPO_ROOT_PATH
 from deployer.utils.rendering import print_colour
@@ -102,12 +103,15 @@ def generate_infra_files(vars):
 @dedicated_cluster_app.command()
 def aws(
     cluster_name: str = typer.Option(..., prompt="Name of the cluster to deploy"),
-    hub_type: str = typer.Option(
-        ..., prompt="Type of hub. Choose from `basehub` or `daskhub`"
-    ),
     cluster_region: str = typer.Option(
         ..., prompt="The region where to deploy the cluster"
     ),
+    hub_type: Annotated[
+        str,
+        typer.Option(
+            prompt="Please type in the hub type: basehub/daskhub.\n-> If this cluster will host daskhubs, please type `daskhub`.\n-> If you don't know this info, or this is not the case, just hit ENTER"
+        ),
+    ] = "basehub",
     force: bool = typer.Option(
         False,
         "--force",
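
The effect of this change is that `hub_type` becomes an optional prompt that defaults to `basehub` instead of a required one. A minimal standalone sketch of the behaviour (assuming `typer` and `typing_extensions` are installed; the command name and prompt text here are illustrative, not the deployer's own):

```python
import typer
from typing_extensions import Annotated

app = typer.Typer()


@app.command()
def demo(
    # Because a default is present, typer renders the prompt as
    # "Hub type [basehub]:" and a plain ENTER accepts the default,
    # so callers no longer have to name a hub type explicitly.
    hub_type: Annotated[
        str,
        typer.Option(prompt="Hub type"),
    ] = "basehub",
):
    print(f"Generating infra files for a {hub_type} cluster")


if __name__ == "__main__":
    app()
```
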
40 changes: 15 additions & 25 deletions deployer/commands/generate/dedicated_cluster/gcp.py
@@ -43,9 +43,7 @@ def generate_terraform_file(vars):
     Generates the `terraform/gcp/projects/<cluster_name>.tfvars` terraform file
     required to create a GCP cluster
     """
-    with open(
-        REPO_ROOT_PATH / f'terraform/gcp/projects/{vars["hub_type"]}-template.tfvars'
-    ) as f:
+    with open(REPO_ROOT_PATH / "terraform/gcp/projects/cluster.tfvars.template") as f:
         tfvars_template = jinja2.Template(f.read())
 
     print_colour("Generating the terraform infrastructure file...", "yellow")
@@ -59,31 +57,24 @@

 @dedicated_cluster_app.command()
 def gcp(
-    cluster_name: Annotated[
-        str, typer.Option(prompt="Please type the name of the new cluster")
-    ],
-    project_id: Annotated[
-        str, typer.Option(prompt="Please insert the Project ID of the GCP project")
-    ],
-    hub_name: Annotated[
+    cluster_name: str = typer.Option(..., prompt="Name of the cluster to deploy"),
+    cluster_region: str = typer.Option(
+        ..., prompt="The region where to deploy the cluster"
+    ),
+    project_id: str = typer.Option(
+        ..., prompt="Please insert the Project ID of the GCP project"
+    ),
+    hub_type: Annotated[
         str,
         typer.Option(
-            prompt="Please insert the name of first hub to add to the cluster"
+            prompt="Please type in the hub type: basehub/daskhub.\n-> If this cluster will host daskhubs, please type `daskhub`.\n-> If you don't know this info, or this is not the case, just hit ENTER"
         ),
-    ],
-    cluster_region: Annotated[
-        str, typer.Option(prompt="Please insert the name of the cluster region")
-    ] = "us-central1",
-    hub_type: Annotated[
-        str, typer.Option(prompt="Please insert the hub type of the first hub")
     ] = "basehub",
-    force: Annotated[
-        bool,
-        typer.Option(
-            "--force",
-            help="Whether or not to force the override of the files that already exist",
-        ),
-    ] = False,
+    force: bool = typer.Option(
+        False,
+        "--force",
+        help="Whether or not to force the override of the files that already exist",
+    ),
 ):
     """
     Automatically generates the initial files, required to setup a new cluster on GCP if they don't exist.
@@ -96,7 +87,6 @@ def gcp(
"hub_type": hub_type,
"cluster_region": cluster_region,
"project_id": project_id,
"hub_name": hub_name,
}

if not check_before_continuing_with_generate_command(
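
Both generate commands now render one shared template, `terraform/gcp/projects/cluster.tfvars.template`, instead of a per-hub-type template. A rough sketch of that jinja2 flow, with hypothetical prompt answers (the deployer's surrounding helper functions are omitted):

```python
import jinja2

# Hypothetical answers to the CLI prompts above
vars = {
    "cluster_name": "example-cluster",
    "cluster_region": "us-central1",
    "project_id": "example-project",
    "hub_type": "basehub",
}

# Read the shared template and substitute the prompt answers into it
with open("terraform/gcp/projects/cluster.tfvars.template") as f:
    template = jinja2.Template(f.read())

# The deployer writes the rendered result out as
# terraform/gcp/projects/<cluster_name>.tfvars
with open(f"terraform/gcp/projects/{vars['cluster_name']}.tfvars", "w") as f:
    f.write(template.render(**vars))
```
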
78 changes: 78 additions & 0 deletions docs/howto/features/dask.md
@@ -0,0 +1,78 @@
(howto:features:daskhubs)=
# Add support for daskhubs in an existing cluster

## GCP

Dask node pools can be set up with terraform by adding the following to the cluster's terraform config file:

```terraform
# Setup a single node pool for dask workers.
#
# A not yet fully established policy is being developed about using a single
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
},
}
```

This provisions a single `n2-highmem-16` node pool. The reasoning behind this machine choice can be found in https://github.com/2i2c-org/infrastructure/issues/2687.

````{tip}
Don't forget to run `terraform plan` and `terraform apply` for the new node pool to be created.
```bash
terraform plan -var-file projects/$CLUSTER_NAME.tfvars
```
```bash
terraform apply -var-file projects/$CLUSTER_NAME.tfvars
```
````
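
Once the node pool exists, one way to check that dask workers really trigger a scale-up is from a user server on the hub. This is a sketch, assuming the hub is a daskhub with a working `dask-gateway` (the gateway address is auto-discovered when run in-cluster):

```python
from dask_gateway import Gateway

# Ask the hub's dask-gateway for a cluster with two workers; the
# autoscaler should respond by adding n2-highmem-16 nodes.
gateway = Gateway()
cluster = gateway.new_cluster()
cluster.scale(2)

client = cluster.get_client()
# Block until both workers are up, then clean up.
client.wait_for_workers(2)
print(list(client.scheduler_info()["workers"]))
cluster.shutdown()
```
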

## AWS

We use `eksctl` with `jsonnet` to provision our kubernetes clusters on
AWS, and we can configure a node group there for the dask pods to run on.

1. In the appropriate `.jsonnet` file, update the `local daskNodes`:

This is how it could look in a `.jsonnet` file after updating the `local daskNodes = []` variable:

```jsonnet
local daskNodes = [
// Node definitions for dask worker nodes. Config here is merged
// with our dask worker node definition, which uses spot instances.
// A `node.kubernetes.io/instance-type` label is set to the name of the
// *first* item in instanceDistribution.instanceTypes, to match
// what we do with notebook nodes. Pods can request a particular
// kind of node with a nodeSelector
//
// A not yet fully established policy is being developed about using a single
// node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
//
{ instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
];
```
2. Render the `.jsonnet` file into a `.yaml` file that `eksctl` can use
```bash
export CLUSTER_NAME=<your_cluster>
```

```bash
jsonnet $CLUSTER_NAME.jsonnet > $CLUSTER_NAME.eksctl.yaml
```

3. Create the nodegroup

```bash
eksctl create nodegroup -f $CLUSTER_NAME.eksctl.yaml
```

This should create the nodegroup with 0 nodes in it, and the
autoscaler should recognize this! `eksctl` will also set up the
appropriate driver installer, so you won't have to.
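
After a dask worker has been requested (for example with the gateway snippet in the GCP section above), you can confirm that a node of the expected instance type joined the cluster. A sketch using the official `kubernetes` Python client, assuming your kubeconfig already points at the new cluster:

```python
from kubernetes import client, config

# Load credentials from the local kubeconfig (e.g. the one eksctl wrote)
config.load_kube_config()
v1 = client.CoreV1Api()

# Dask worker nodes carry the instance-type label mentioned in the
# jsonnet comment above.
nodes = v1.list_node(
    label_selector="node.kubernetes.io/instance-type=r5.4xlarge"
)
for node in nodes.items:
    print(node.metadata.name)
```
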
1 change: 1 addition & 0 deletions docs/howto/features/index.md
@@ -12,6 +12,7 @@ anonymized-usernames.md
 buckets.md
 cloud-access.md
 cryptnono.md
+dask.md
 dedicated-nodepool.md
 ephemeral.md
 github.md
65 changes: 60 additions & 5 deletions docs/hub-deployment-guide/hubs/new-hub.md
@@ -64,10 +64,65 @@ To deploy a new hub, follow these steps:
    If however a specific feature has been requested that does not come out-of-the-box with `basehub` or `daskhub`, see [](hub-features) for information on how to deploy it and the relevant config that should be added to the `<hub>.values.yaml` file.
    ```
 
-5. Create a Pull Request with the new hub entry, and get a team member to review it.
-6. Once you merge the pull request, the GitHub Action workflow will detect that a new entry has been added to the configuration file.
+5. Make sure you set up the [PersistentVolume](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) in the hub's config.
+
+`````{tab-set}
+````{tab-item} AWS
+:sync: aws-key
+An [EFS instance](https://aws.amazon.com/efs/) to store the hub home directories should exist from when the cluster was created.
+Get the address a hub on this cluster should use for connecting to NFS with
+`terraform output nfs_server_dns`, and set it in the hub's config under
+`nfs.pv.serverIP` (nested under `basehub` when necessary) in the appropriate
+`<hub>.values.yaml` file.
+```yaml
+nfs:
+  enabled: true
+  pv:
+    enabled: true
+    # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
+    mountOptions:
+      - rsize=1048576
+      - wsize=1048576
+      - timeo=600
+      - soft # We pick soft over hard, so NFS lockups don't lead to hung processes
+      - retrans=2
+      - noresvport
+    serverIP: <from-terraform>
+    baseShareName: /
+```
+````
+````{tab-item} Google Cloud
+:sync: gcp-key
+```yaml
+nfs:
+  enabled: true
+  pv:
+    enabled: true
+    mountOptions:
+      - soft
+      - noatime
+    # Google FileStore IP
+    serverIP: <gcp-filestore-ip>
+    # Name of Google Filestore share
+    baseShareName: /homes/
+```
+````
+````{tab-item} Azure
+:sync: azure-key
+N/A
+````
+`````
+
+6. Create a Pull Request with the new hub entry, and get a team member to review it.
+7. Once you merge the pull request, the GitHub Action workflow will detect that a new entry has been added to the configuration file.
    It will then deploy a new JupyterHub with the configuration you've specified onto the corresponding cluster.
-7. Monitor the action to make sure that it completes.
+8. Monitor the action to make sure that it completes.
    If something goes wrong and the workflow does not finish, try [deploying locally](hubs:manual-deploy) to access the logs to help understand what is going on.
    It may be necessary to make new changes to the hub's configuration via a Pull Request, or to *revert* the old Pull Request if you cannot determine how to resolve the problem.

@@ -76,8 +131,8 @@
    You will need to run the [health check locally](hubs:manual-deploy:health-check) to inspect these logs.
    ```
 
-8. Log in to the hub and ensure that the hub works as expected from a user's perspective.
-9. Send a link to the hub's Community Representative(s) so they can confirm that it works from their perspective as well.
+9. Log in to the hub and ensure that the hub works as expected from a user's perspective.
+10. Send a link to the hub's Community Representative(s) so they can confirm that it works from their perspective as well.
 
 ## Automated vs. manual deploys

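
A note on step 5 above: before opening the Pull Request, it can save a deploy cycle to confirm that the NFS server is reachable on the NFS port. A minimal sketch (the IP is hypothetical; substitute the value from `terraform output`, and note that EFS and Filestore are typically only reachable from inside the cluster's network, e.g. from a pod or an existing hub session):

```python
import socket

# Hypothetical address; use the value from `terraform output nfs_server_dns`
# (AWS) or the Filestore IP (GCP).
server_ip = "10.100.12.34"

# NFSv4 listens on TCP 2049; a successful connect suggests the
# PersistentVolume will be able to mount the share.
try:
    with socket.create_connection((server_ip, 2049), timeout=5):
        print(f"{server_ip}:2049 is reachable")
except OSError as exc:
    print(f"could not reach {server_ip}:2049 -> {exc}")
```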