Move to cloud-init based deployer #2

Open · wants to merge 1 commit into master

143 changes: 2 additions & 141 deletions .gitignore
@@ -1,143 +1,4 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

.terraform*
terraform.tfstate*
config.*
inventory
*.auto.tfvars
*tfstate*
121 changes: 48 additions & 73 deletions README.md
@@ -1,96 +1,71 @@

# pyspark-deploy
# Spark deploy

This project automates the process of provisioning and deploying a standalone Spark cluster on EC2 spot instances. This is designed with small teams / organizations in mind, where there isn't budget to maintain permanent infrastructure - everything is optimized around making it easy to spin up a cluster, run jobs, and then immediately terminate the AWS resources on completion.

Once the Docker image is pushed to ECR, deploys generally take ~2 minutes.

- [**Docker**](https://www.docker.com/) is used to wrap up a complete Python + Java + Spark environment, making it easy to develop locally and then deploy an identical environment to a cluster. Just extend the base image and add your code. (Or, if needed, build a totally custom image from scratch.)

- [**Terraform**](https://www.terraform.io/) is used to provision instances on AWS.

- [**Ansible**](https://www.ansible.com/) is used to configure the cluster. Since the application environment is wrapped up in Docker, Ansible just pulls the image on the nodes, injects production config values, and starts the Spark services.
This directory contains a Terraform project for deploying Spark clusters.

## Setup

1. On the host machine, install:

- [Poetry](https://python-poetry.org/)
- [Terraform](https://www.terraform.io/) (>= 0.12.28)

1. Add this repository as a submodule in your project. Eg, under `/deploy` at the root level.

1. Change into the directory, and run `./setup.sh`. This will initialize the Terraform project and install the Python dependencies.

1. Add a `cluster.yml` file in the parent directory - `cp cluster.yml.changeme ../cluster.yml` (the root directory of your project, tracked in git). Fill in values for the required fields:
1. Install [Terraform](https://www.terraform.io/) and [Poetry](https://python-poetry.org/).
1. Run `./setup.sh` to initialize the Terraform and Poetry projects.

```python
aws_vpc_id: str
aws_subnet_id: str
aws_ami: str
public_key_path: str
docker_image: str
aws_access_key_id: str
aws_secret_access_key: str
```
Create a file in this directory called `secrets.auto.tfvars` with the three required credentials:

For full reference on the supported fields, see the `ClusterConfig` class in `cluster.py`, a [pydantic](https://pydantic-docs.helpmanual.io/) model that defines the configuration schema.

**Note:** pyspark-deploy assumes the Docker image is pushed to an ECR repository, and that the provided AWS keypair has permissions to pull the image.

**Note:** For secret values like `aws_access_key_id`, it's recommended to use Ansible vault to encrypt the values. (See - [Single Encrypted Variable](https://docs.ansible.com/ansible/2.3/playbooks_vault.html#single-encrypted-variable)). pyspark-deploy will automatically decrypt these values at deploy time. If you do this, it generally also makes sense to put the vault password in a file (eg, `~/.vault-pw.txt`), and then set the `ANSIBLE_VAULT_PASSWORD_FILE` environment variable. This avoids having to manually enter the password each time a cluster is created.
```hcl
aws_access_key_id = "XXX"
aws_secret_access_key = "XXX"
wandb_api_key = "XXX"
```

## Usage
In the parent directory, create a directory called `cluster/` and add a file called `default.tfvars`. This file contains default variables that will be applied to all clusters. Generally it makes sense to define at least these four variables, which are required:

Control the cluster with `./cluster.sh`:
```hcl
ecr_server = "XXX.dkr.ecr.us-east-1.amazonaws.com"
ecr_repo = "XXX:latest"
aws_vpc_id = "vpc-XXX"
aws_subnet_id = "subnet-XXX"
```

```bash
Usage: pyspark_deploy.py [OPTIONS] COMMAND [ARGS]...
Then, custom cluster "profiles" can be defined as `.tfvars` files under `../cluster/profiles/`. For example, if a project needs a CPU cluster for ETL jobs and a GPU cluster for model inference, these could be defined with two profiles:

Options:
--help Show this message and exit.
`../cluster/profiles/cpu.tfvars`

Commands:
create Start a cluster.
destroy Destroy the cluster.
login SSH into the master node.
web-admin Open the Spark web UI.
```hcl
spot_worker_count = 30
worker_instance_type = "m5d.metal"
executor_memory = "360g"
```

Generally the workflow looks like:
`../cluster/profiles/gpu.tfvars`

- Develop locally in Docker. When ready to deploy, push the Docker image to the ECR repository specified in `docker_image`.
- Run `./cluster.sh create`, then `./cluster.sh login` once the cluster is up.
- Run jobs.
- Tear down with `./cluster.sh destroy`.
```hcl
on_demand_worker_count = 10
gpu_workers = true
worker_instance_type = "g4dn.2xlarge"
executor_memory = "28g"
```

## Profiles
## Usage

It's common to need a handful of cluster "profiles" for a single project. Eg, you might have some jobs / workflows that need a small number of modest worker instances; but other jobs that need a large number of big workers, and others that need GPU workers. To support this, the `cluster.yml` file can contain any number of named "profiles," which can provide override values that customize the cluster loadout.
Then, use `./cluster.sh <command>` to control the cluster.

```yaml
profiles:
### `./cluster.sh create <profile>`

Create a cluster. If a profile name is passed, the variables in the corresponding `.tfvars` file under `../cluster/profiles/` will be added as overrides to the Terraform configuration.

cpu_small:
worker_count: 3
worker_instance_type: m5a.8xlarge
worker_spot_price: 0.8
executor_memory: 100g
The `create` command will apply the Terraform configuration and then tail the output of the `/var/log/cloud-init-output.log` file on the master node, to give some visibility into the state of the startup process. When cloud-init finishes, the command will exit.
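For example (assuming the `gpu` profile file shown above exists at `../cluster/profiles/gpu.tfvars`):

```bash
# Apply only the defaults from ../cluster/default.tfvars:
./cluster.sh create

# Layer ../cluster/profiles/gpu.tfvars on top of the defaults:
./cluster.sh create gpu
```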

cpu_big:
worker_count: 10
worker_instance_type: r5n.16xlarge
worker_spot_price: 1.6
executor_memory: 480g
### `./cluster.sh destroy`

Destroy the cluster.

gpu:
worker_count: 5
worker_instance_type: p3.2xlarge
worker_docker_runtime: nvidia
worker_spot_price: 1.0
executor_memory: 40g
```
### `./cluster.sh login`

SSH into the master node.

Then, when creating a cluster, just pass the profile name, and these values will be merged into the configuration used to deploy the cluster:
### `./cluster.sh admin`

Open the Spark web UI in a browser.

`./cluster.sh create gpu`
### `./cluster.sh cat-cloudinit-log`

Download the `cloud-init-output.log` file from the master node. Useful for debugging cluster startup problems.
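For example, to keep a local copy for inspection (the output filename here is just illustrative):

```bash
./cluster.sh cat-cloudinit-log > cloud-init-output.log
```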
14 changes: 0 additions & 14 deletions ansible/ansible.cfg

This file was deleted.

16 changes: 0 additions & 16 deletions ansible/deploy.yml

This file was deleted.

56 changes: 0 additions & 56 deletions ansible/roles/spark/defaults/main.yml

This file was deleted.

66 changes: 0 additions & 66 deletions ansible/roles/spark/tasks/main.yml

This file was deleted.

15 changes: 0 additions & 15 deletions ansible/roles/spark/tasks/start_container.yml

This file was deleted.

7 changes: 0 additions & 7 deletions ansible/roles/spark/templates/docker-bash.sh.j2

This file was deleted.

45 changes: 0 additions & 45 deletions ansible/roles/spark/templates/log4j.properties.j2

This file was deleted.

31 changes: 0 additions & 31 deletions ansible/roles/spark/templates/spark-defaults.conf.j2

This file was deleted.

22 changes: 0 additions & 22 deletions ansible/roles/spark/templates/spark-env.sh.j2

This file was deleted.

139 changes: 139 additions & 0 deletions cloud-config.yaml
@@ -0,0 +1,139 @@
#cloud-config

write_files:

- path: /etc/spark/conf/log4j.properties
content: |
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN
# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
# Suppress Parquet logging.
log4j.logger.org.apache.spark.sql.execution.datasources.parquet=ERROR
log4j.logger.org.apache.spark.sql.execution.datasources.FileScanRDD=ERROR
log4j.logger.org.apache.hadoop.io.compress.CodecPool=ERROR
- path: /etc/spark/conf/spark-env.sh
content: |
#!/usr/bin/env bash
# Use ipython for driver.
PYSPARK_DRIVER_PYTHON=ipython
# So that links work properly in Spark admin.
SPARK_PUBLIC_DNS=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
# Avoid too-many-open-files errors.
ulimit -n 100000
# Disable parallelism for sklearn / spacy.
OPENBLAS_NUM_THREADS=1
# Use common hash seed across nodes (needed for reduceByKey).
PYTHONHASHSEED=1
SPARK_WORKER_DIR=${data_dir}/work
# Credentials.
AWS_ACCESS_KEY_ID=${aws_access_key_id}
AWS_SECRET_ACCESS_KEY=${aws_secret_access_key}
WANDB_API_KEY=${wandb_api_key}
- path: /etc/spark/conf/spark-defaults.conf
content: |
spark.master spark://${master_private_ip}:7077
spark.driver.memory ${driver_memory}
spark.executor.memory ${executor_memory}
spark.driver.maxResultSize ${max_driver_result_size}
spark.jars.packages ${join(",", spark_packages)}
spark.local.dir ${data_dir}/spark
spark.driver.extraJavaOptions -Dderby.system.home=${data_dir}/derby
spark.sql.warehouse.dir ${data_dir}/spark-warehouse
spark.hadoop.hadoop.tmp.dir ${data_dir}/hadoop
spark.master.rest.enabled true
spark.task.maxFailures 20
spark.hadoop.fs.s3a.connection.maximum 1000
spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled true
spark.sql.shuffle.partitions 2000
spark.sql.execution.arrow.maxRecordsPerBatch 200
- path: /var/lib/cloud/scripts/per-boot/start-spark.sh
permissions: 0755
content: |
#!/bin/sh
export AWS_ACCESS_KEY_ID=${aws_access_key_id}
export AWS_SECRET_ACCESS_KEY=${aws_secret_access_key}
aws ecr get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin ${ecr_server}
docker run -d --rm \
--name spark \
--network host \
--env SPARK_ENV=prod \
-v /data:/data \
-v /etc/spark/conf:/opt/spark/conf \
-p 8080:8080 \
${!master && gpu_workers ? "--gpus all" : ""} \
${ecr_server}/${ecr_repo} \
${
master ?
"spark-class org.apache.spark.deploy.master.Master" :
"spark-class org.apache.spark.deploy.worker.Worker spark://${master_private_ip}:7077"
}
- path: /home/ubuntu/spark-bash.sh
permissions: 0755
content: |
#!/bin/bash
CONTAINER_ID=$(sudo docker ps -q --filter "name=spark")
DOCKER_CMD="sudo docker exec -it $${CONTAINER_ID} bash"
tmux attach -t spark || tmux new -s spark "$DOCKER_CMD"
# Automatically drop into Spark shell on login.
- path: /home/ubuntu/.bashrc
content: |
source ./spark-bash.sh
append: true
185 changes: 185 additions & 0 deletions cluster.py
@@ -0,0 +1,185 @@
import typer
import subprocess
import json
import requests
import paramiko
import re

from typing import Optional, Iterator
from pathlib import Path
from dataclasses import dataclass
from loguru import logger


ROOT_DIR = Path(__file__).parent
CONFIG_DIR = ROOT_DIR.parent / 'cluster'
CLOUDINIT_LOG_PATH = '/var/log/cloud-init-output.log'


def read_terraform_output(src: str, key: str):
"""Read a given output key from the Terraform state.
"""
with open(src) as fh:
return json.load(fh)['outputs'][key]['value']


def tail_log_file(
hostname: str,
username: str,
log_file_path: str,
) -> Iterator[str]:
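"""Stream lines from a remote log file as they are written (`tail -f` over SSH).
"""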

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname, username=username)

command = f'tail -f {log_file_path}'
stdin, stdout, stderr = client.exec_command(command)

for line in stdout:
yield line.strip()

client.close()


@dataclass
class Cluster:

master_dns: str

@classmethod
def from_tfstate(
cls,
*,
src: str = ROOT_DIR / 'terraform.tfstate',
master_dns_key: str = 'master_dns',
):
"""Create an instance from the master DNS output in a tfstate file.
"""
try:
master_dns = read_terraform_output(src, master_dns_key)
except Exception:
raise RuntimeError(
'Unable to read master node DNS. Is the cluster up?'
)

return cls(master_dns)

@property
def web_ui_url(self) -> str:
"""Build the URL for the Spark web UI.
"""
return f'http://{self.master_dns}:8080'

def ssh_client(self) -> paramiko.SSHClient:
"""Create an SSH client to the master node.
"""
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(self.master_dns, username='ubuntu', timeout=60*3)
[Author comment] Need to bump up the timeout.

return client

def cat_cloudinit_log(self) -> str:
"""Pull `cloud-init-output.log` from the master node using Fabric.
"""
with self.ssh_client() as client:
_, stdout, _ = client.exec_command(f'cat {CLOUDINIT_LOG_PATH}')
return stdout.read().decode()

def tail_cloudinit_log(self) -> Iterator[str]:
"""Tail `cloud-init-output.log` in real-time.
"""
with self.ssh_client() as client:

cmd = f'tail -f {CLOUDINIT_LOG_PATH}'
stdin, stdout, stderr = client.exec_command(cmd)

for line in stdout:
yield line.strip()

def tail_cloudinit_log_until_finished(self) -> Iterator[str]:
"""Tail `cloud-init-output.log` until the cluster is up.
"""
for line in self.tail_cloudinit_log():
yield line
if re.search(r'Cloud-init .* finished', line):
break


def read_master_dns():
"""Read the master IP out of the TF state.
"""
with open(ROOT_DIR / 'terraform.tfstate') as fh:
return json.load(fh)['outputs']['master_dns']['value']


app = typer.Typer()


@app.command()
def create(profile: Optional[str] = typer.Argument(None)):
"""Create a cluster using default variables defined in:
`../cluster/default.tfvars`
If a "profile" is passed, also inject variables defined at:
`../cluster/profiles/<profile>.tfvars`
"""

cmd = [
'terraform', 'apply',
'-var-file', CONFIG_DIR / 'default.tfvars',
]

if profile is not None:
cmd += [
'-var-file', CONFIG_DIR / 'profiles' / f'{profile}.tfvars'
]

subprocess.run(cmd)

cluster = Cluster.from_tfstate()

logger.info('Tailing `cloud-init-output.log` on the master node...')

for line in cluster.tail_cloudinit_log_until_finished():
print(line)

logger.info('Cluster up 🚀🚀')


@app.command()
def destroy():
"""Destroy the cluster.
"""
subprocess.run([
'terraform', 'destroy',
'-var-file', CONFIG_DIR / 'default.tfvars',
])


@app.command()
def login():
"""SSH into the master node.
"""
subprocess.run(['ssh', f'ubuntu@{read_master_dns()}'])


@app.command()
def admin():
"""Open the Spark web UI.
"""
subprocess.run(['open', f'http://{read_master_dns()}:8080'])


@app.command()
def cat_cloudinit_log():
"""Print the cloud-init log from the master node.
"""
cluster = Cluster.from_tfstate()
print(cluster.cat_cloudinit_log())


if __name__ == '__main__':
app()
2 changes: 1 addition & 1 deletion cluster.sh
@@ -1,3 +1,3 @@
#!/bin/sh

exec poetry run python ./pyspark_deploy.py "$@"
exec poetry run python ./cluster.py "$@"
29 changes: 0 additions & 29 deletions cluster.yml.changeme

This file was deleted.

54 changes: 0 additions & 54 deletions docker/Dockerfile

This file was deleted.

7 changes: 0 additions & 7 deletions docker/docker-compose.yml

This file was deleted.

33 changes: 0 additions & 33 deletions docker/spark-defaults.conf

This file was deleted.

69 changes: 0 additions & 69 deletions docker/spark-env.sh

This file was deleted.

106 changes: 66 additions & 40 deletions terraform/main.tf → main.tf
@@ -78,77 +78,103 @@ resource "aws_key_pair" "spark" {
public_key = file(var.public_key_path)
}

resource "aws_instance" "master" {
ami = var.aws_ami
instance_type = var.master_instance_type
subnet_id = var.aws_subnet_id
vpc_security_group_ids = [aws_security_group.spark.id]
key_name = aws_key_pair.spark.key_name
associate_public_ip_address = true
# TODO: Provision the subnet directly?
resource "aws_network_interface" "master" {
subnet_id = var.aws_subnet_id
security_groups = [aws_security_group.spark.id]
}

tags = {
Name = "spark-master"
locals {
user_data_vars = {
ecr_server = var.ecr_server
ecr_repo = var.ecr_repo
aws_access_key_id = var.aws_access_key_id
aws_secret_access_key = var.aws_secret_access_key
wandb_api_key = var.wandb_api_key
driver_memory = var.driver_memory
executor_memory = var.executor_memory
gpu_workers = var.gpu_workers
max_driver_result_size = var.max_driver_result_size
data_dir = var.data_dir
spark_packages = var.spark_packages
master_private_ip = aws_network_interface.master.private_ip
}
}

locals {
master_user_data = templatefile(
"cloud-config.yaml",
merge(local.user_data_vars, {
master = true
})
)
worker_user_data = templatefile(
"cloud-config.yaml",
merge(local.user_data_vars, {
master = false
})
)
}

resource "aws_instance" "master" {
ami = var.aws_ami
instance_type = var.master_instance_type
key_name = aws_key_pair.spark.key_name
user_data = local.master_user_data

network_interface {
network_interface_id = aws_network_interface.master.id
device_index = 0
}

root_block_device {
volume_size = var.master_root_vol_size
volume_size = var.root_vol_size
}

tags = {
Name = "spark-master"
}
}

resource "aws_instance" "worker" {
# TODO: Name tags on the workers.

resource "aws_instance" "workers" {
ami = var.aws_ami
instance_type = var.worker_instance_type
subnet_id = var.aws_subnet_id
vpc_security_group_ids = [aws_security_group.spark.id]
key_name = aws_key_pair.spark.key_name
associate_public_ip_address = true
count = var.on_demand_worker_count
user_data = local.worker_user_data

root_block_device {
volume_size = var.worker_root_vol_size
volume_size = var.root_vol_size
}
}

resource "aws_spot_instance_request" "worker" {
resource "aws_spot_instance_request" "workers" {
ami = var.aws_ami
instance_type = var.worker_instance_type
subnet_id = var.aws_subnet_id
vpc_security_group_ids = [aws_security_group.spark.id]
key_name = aws_key_pair.spark.key_name
spot_price = var.worker_spot_price
spot_type = "one-time"
associate_public_ip_address = true
wait_for_fulfillment = true
count = var.spot_worker_count
user_data = local.worker_user_data

root_block_device {
volume_size = var.worker_root_vol_size
volume_size = var.root_vol_size
}
}

data "template_file" "inventory" {
template = file("${path.module}/inventory.tpl")

vars = {
master_ip = aws_instance.master.public_ip
master_private_ip = aws_instance.master.private_ip
on_demand_worker_ips = join("\n", [for ip in aws_instance.worker.*.public_ip : ip if ip != null])
spot_worker_ips = join("\n", [for ip in aws_spot_instance_request.worker.*.public_ip : ip if ip != null])
}

# Wait for assigned IPs to be known, before writing inventory.
depends_on = [
aws_instance.master,
aws_instance.worker,
aws_spot_instance_request.worker,
]
data "aws_instance" "master" {
instance_id = aws_instance.master.id
}

resource "local_file" "inventory" {
content = data.template_file.inventory.rendered
filename = "${path.module}/../ansible/inventory"
}

output "master_ip" {
value = aws_instance.master.public_ip
}
# XXX: Pull from the "data" source, which will read the (possibly new) DNS
# after the plan is applied.
output "master_dns" {
value = data.aws_instance.master.public_dns
}
951 changes: 581 additions & 370 deletions poetry.lock

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions pyproject.toml
@@ -3,14 +3,20 @@ name = "pyspark-deploy"
version = "0.1.0"
description = ""
authors = ["David McClure <davidwilliammcclure@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.8"
ansible = "^5.1.0"
pydantic = "^1.8.2"
click = "^8.0.3"
ipython = "^7.30.1"
python = "^3.10"
ipython = "^8.11.0"
typer = "^0.7.0"
requests = "^2.28.2"
loguru = "^0.6.0"
paramiko = "^3.0.0"


[tool.poetry.group.dev.dependencies]
flake8 = "^6.0.0"

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
204 changes: 0 additions & 204 deletions pyspark_deploy.py

This file was deleted.

2 changes: 1 addition & 1 deletion setup.sh
@@ -1,4 +1,4 @@
#!/bin/sh

cd terraform && terraform init
terraform init
poetry install
14 changes: 0 additions & 14 deletions terraform/inventory.tpl

This file was deleted.

51 changes: 0 additions & 51 deletions terraform/variables.tf

This file was deleted.

4 changes: 0 additions & 4 deletions terraform/versions.tf

This file was deleted.

101 changes: 101 additions & 0 deletions variables.tf
@@ -0,0 +1,101 @@

variable "ecr_server" {
type = string
}

variable "ecr_repo" {
type = string
}

# AWS

variable "aws_region" {
default = "us-east-1"
}

variable "aws_vpc_id" {
type = string
}

variable "aws_subnet_id" {
type = string
}

# Deep Learning Base AMI (Ubuntu 18.04) Version 34.1
variable "aws_ami" {
default = "ami-04eb5b2f5ef92e8b8"
}

variable "public_key_path" {
default = "~/.ssh/spark.pub"
[Author comment] This could be any pub key. We could default to id_rsa.pub for convenience, or parametrize it in the secrets or default vars.

}

# Instances

variable "root_vol_size" {
default = 100
}

variable "master_instance_type" {
default = "c5.xlarge"
}

# TODO: Possible to automatically use, eg, 0.8 * instance ram?
variable "driver_memory" {
default = "4g"
}

variable "worker_instance_type" {
default = "c5.xlarge"
}

variable "executor_memory" {
default = "4g"
}

variable "spot_worker_count" {
default = 0
}

variable "on_demand_worker_count" {
default = 0
}

variable "gpu_workers" {
default = false
}

# Config

variable "data_dir" {
default = "/data"
}

variable "max_driver_result_size" {
default = "10g"
}

# AWS / S3 packages for Spark 3.3.
variable "spark_packages" {
default = [
"org.apache.spark:spark-hadoop-cloud_2.13:3.3.2"
]
}

# Secrets
# TODO: Use a single env_vars list / dict variable?

variable "aws_access_key_id" {
type = string
sensitive = true
}

variable "aws_secret_access_key" {
type = string
sensitive = true
}

variable "wandb_api_key" {
type = string
sensitive = true
}
4 changes: 4 additions & 0 deletions versions.tf
@@ -0,0 +1,4 @@

terraform {
required_version = ">= 0.14"
}