diff --git a/config.tf b/config.tf
index 9476676..afbc3b8 100644
--- a/config.tf
+++ b/config.tf
@@ -3,11 +3,17 @@ terraform {
     encrypt = true
     bucket  = "terraform-persistence"
     key     = "terraform-emr-pyspark.tfstate"
-    region  = "eu-central-1"
+    region  = "us-west-2"
+  }
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
   }
 }
 
 provider "aws" {
-  version = "~> 1.0"
-  region  = "${var.region}"
+  region = var.region
 }
diff --git a/main.tf b/main.tf
index 69bbb2e..3f950aa 100644
--- a/main.tf
+++ b/main.tf
@@ -1,6 +1,6 @@
 module "s3" {
   source = "./modules/s3"
-  name   = "${var.name}"
+  name   = var.name
 }
 
 module "iam" {
@@ -9,26 +9,26 @@ module "iam" {
 
 module "security" {
   source              = "./modules/security"
-  name                = "${var.name}"
-  vpc_id              = "${var.vpc_id}"
-  ingress_cidr_blocks = "${var.ingress_cidr_blocks}"
+  name                = var.name
+  vpc_id              = var.vpc_id
+  ingress_cidr_blocks = var.ingress_cidr_blocks
 }
 
 module "emr" {
   source                    = "./modules/emr"
-  name                      = "${var.name}"
-  release_label             = "${var.release_label}"
-  applications              = "${var.applications}"
-  subnet_id                 = "${var.subnet_id}"
-  key_name                  = "${var.key_name}"
-  master_instance_type      = "${var.master_instance_type}"
-  master_ebs_size           = "${var.master_ebs_size}"
-  core_instance_type        = "${var.core_instance_type}"
-  core_instance_count       = "${var.core_instance_count}"
-  core_ebs_size             = "${var.core_ebs_size}"
-  emr_master_security_group = "${module.security.emr_master_security_group}"
-  emr_slave_security_group  = "${module.security.emr_slave_security_group}"
-  emr_ec2_instance_profile  = "${module.iam.emr_ec2_instance_profile}"
-  emr_service_role          = "${module.iam.emr_service_role}"
-  emr_autoscaling_role      = "${module.iam.emr_autoscaling_role}"
+  name                      = var.name
+  release_label             = var.release_label
+  applications              = var.applications
+  subnet_id                 = var.subnet_id
+  key_name                  = var.key_name
+  master_instance_type      = var.master_instance_type
+  master_ebs_size           = var.master_ebs_size
+  core_instance_type        = var.core_instance_type
+  core_instance_count       = var.core_instance_count
+  core_ebs_size             = var.core_ebs_size
+  emr_master_security_group = module.security.emr_master_security_group
+  emr_slave_security_group  = module.security.emr_slave_security_group
+  emr_ec2_instance_profile  = module.iam.emr_ec2_instance_profile
+  emr_service_role          = module.iam.emr_service_role
+  emr_autoscaling_role      = module.iam.emr_autoscaling_role
 }
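Because the config.tf change above both moves the remote state to a new region and jumps the AWS provider from ~> 1.0 to ~> 5.0, the working directory has to be re-initialized before anything is planned. A minimal sketch of that step, assuming the existing state should be migrated rather than recreated:

  # Re-initialize to pick up the new backend region and the pinned provider
  $ terraform init -migrate-state -upgrade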
= "MASTER" - instance_type = "${var.master_instance_type}" - instance_count = "1" ebs_config { - size = "${var.master_ebs_size}" + size = var.master_ebs_size type = "gp2" volumes_per_instance = 1 } } - instance_group { + core_instance_group { + instance_type = var.core_instance_type + instance_count = var.core_instance_count name = "EMR slave" - instance_role = "CORE" - instance_type = "${var.core_instance_type}" - instance_count = "${var.core_instance_count}" ebs_config { - size = "${var.core_ebs_size}" + size = var.core_ebs_size type = "gp2" volumes_per_instance = 1 } } - tags { + tags = { Name = "${var.name} - Spark cluster" } - service_role = "${var.emr_service_role}" - autoscaling_role = "${var.emr_autoscaling_role}" + service_role = var.emr_service_role + autoscaling_role = var.emr_autoscaling_role bootstrap_action { name = "Bootstrap setup." path = "s3://${var.name}/scripts/bootstrap_actions.sh" } - step = [ - { - name = "Copy script file from s3." - action_on_failure = "CONTINUE" + step { + name = "Copy script file from s3." + action_on_failure = "CONTINUE" - hadoop_jar_step { - jar = "command-runner.jar" - args = ["aws", "s3", "cp", "s3://${var.name}/scripts/pyspark_quick_setup.sh", "/home/hadoop/"] - } - }, - { - name = "Setup pyspark with conda." - action_on_failure = "CONTINUE" + hadoop_jar_step { + jar = "command-runner.jar" + args = ["aws", "s3", "cp", "s3://${var.name}/scripts/pyspark_quick_setup.sh", "/home/hadoop/"] + } + } - hadoop_jar_step { - jar = "command-runner.jar" - args = ["sudo", "bash", "/home/hadoop/pyspark_quick_setup.sh"] - } - }, - ] + step { + name = "Setup pyspark with conda." + action_on_failure = "CONTINUE" + + hadoop_jar_step { + jar = "command-runner.jar" + args = ["sudo", "bash", "/home/hadoop/pyspark_quick_setup.sh"] + } + } configurations_json = <> $HOME/.bashrc && source $HOME/.bashrc +# Install Miniconda +MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" +wget --quiet ${MINICONDA_URL} -O ~/miniconda.sh +/bin/bash ~/miniconda.sh -b -p $HOME/conda -# install packages -conda install -y notebook=5.7.* jupyter=1.0.* pandas seaborn +# Initialize Conda for bash shell. This will also update the .bashrc file. 
diff --git a/scripts/pyspark_quick_setup.sh b/scripts/pyspark_quick_setup.sh
index c0526a4..ee09e86 100755
--- a/scripts/pyspark_quick_setup.sh
+++ b/scripts/pyspark_quick_setup.sh
@@ -1,6 +1,22 @@
 #!/usr/bin/env bash
 
-# bind conda to spark
-echo -e "\nexport PYSPARK_PYTHON=/home/hadoop/conda/bin/python" >> /etc/spark/conf/spark-env.sh
-echo "export PYSPARK_DRIVER_PYTHON=/home/hadoop/conda/bin/jupyter" >> /etc/spark/conf/spark-env.sh
-echo "export PYSPARK_DRIVER_PYTHON_OPTS='notebook --no-browser --port 8888 --ip=\"0.0.0.0\"'" >> /etc/spark/conf/spark-env.sh
+# Ensure the script fails on any error
+set -e
+
+# Set the paths for PySpark's Python and the Jupyter driver
+PYSPARK_PYTHON_PATH="/home/hadoop/conda/bin/python"
+PYSPARK_DRIVER_PATH="/home/hadoop/conda/bin/jupyter"
+PYSPARK_DRIVER_OPTS="notebook --no-browser --port 8888 --ip=0.0.0.0"
+
+# Append configurations to spark-env.sh
+cat >> /etc/spark/conf/spark-env.sh << EOF
+
+# Bind PySpark to the Conda Python installation
+export PYSPARK_PYTHON=${PYSPARK_PYTHON_PATH}
+
+# Set Jupyter as the driver for PySpark
+export PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PATH}
+
+# Set options for the Jupyter driver
+export PYSPARK_DRIVER_PYTHON_OPTS="${PYSPARK_DRIVER_OPTS}"
+EOF
diff --git a/terraform.tfvars b/terraform.tfvars
index 86e96e9..2503c02 100644
--- a/terraform.tfvars
+++ b/terraform.tfvars
@@ -1,18 +1,18 @@
 # EMR general configurations
-name = "spark-app"
-region = "eu-central-1"
-subnet_id = ""
-vpc_id = ""
-key_name = ""
-ingress_cidr_blocks = "0.0.0.0/0"
-release_label = "emr-5.16.0"
-applications = ["Hadoop", "Spark"]
+name                = "spark-app"
+region              = "eu-central-1"
+subnet_id           = ""
+vpc_id              = ""
+key_name            = ""
+ingress_cidr_blocks = ["0.0.0.0/0"]
+release_label       = "emr-6.12.0"
+applications        = ["Hadoop", "Spark"]
 
 # Master node configurations
-master_instance_type = "m3.xlarge"
-master_ebs_size = "50"
+master_instance_type = "m5.xlarge"
+master_ebs_size      = 50
 
 # Slave nodes configurations
-core_instance_type = "m3.xlarge"
+core_instance_type  = "m5.xlarge"
 core_instance_count = 1
-core_ebs_size = "50"
+core_ebs_size       = 50
diff --git a/variables.tf b/variables.tf
index 668b008..c71f807 100644
--- a/variables.tf
+++ b/variables.tf
@@ -1,15 +1,64 @@
-variable "name" {}
-variable "region" {}
-variable "subnet_id" {}
-variable "vpc_id" {}
-variable "key_name" {}
-variable "release_label" {}
+variable "name" {
+  description = "Name for the EMR cluster"
+  type        = string
+}
+
+variable "region" {
+  description = "AWS region where resources will be created"
+  type        = string
+}
+
+variable "subnet_id" {
+  description = "ID of the subnet in which the EMR cluster will be launched"
+  type        = string
+}
+
+variable "vpc_id" {
+  description = "ID of the VPC in which the EMR cluster will reside"
+  type        = string
+}
+
+variable "key_name" {
+  description = "Name of the EC2 key pair for SSH access to the instances"
+  type        = string
+}
+
+variable "release_label" {
+  description = "Release label for the EMR cluster"
+  type        = string
+}
+
 variable "applications" {
-  type = "list"
-}
-variable "master_instance_type" {}
-variable "master_ebs_size" {}
-variable "core_instance_type" {}
-variable "core_instance_count" {}
-variable "core_ebs_size" {}
-variable "ingress_cidr_blocks" {}
+  description = "List of applications to be installed and configured on the EMR cluster"
+  type        = list(string)
+}
+
+variable "master_instance_type" {
+  description = "EC2 instance type for the master node"
+  type        = string
+}
+
+variable "master_ebs_size" {
+  description = "EBS volume size (in GB) for the master node"
+  type        = number
+}
+
+variable "core_instance_type" {
+  description = "EC2 instance type for the core nodes"
+  type        = string
+}
+
+variable "core_instance_count" {
+  description = "Number of core nodes in the EMR cluster"
+  type        = number
+}
+
+variable "core_ebs_size" {
+  description = "EBS volume size (in GB) for the core nodes"
+  type        = number
+}
+
+variable "ingress_cidr_blocks" {
+  description = "List of CIDR blocks to allow incoming traffic from"
+  type        = list(string)
+}
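With the patch applied, a quick local sanity check could look like the following sketch. It assumes AWS credentials for the target account are already configured and that the empty subnet_id, vpc_id, and key_name values in terraform.tfvars have been filled in:

  $ terraform fmt -check -recursive   # verify formatting across root and modules
  $ terraform validate                # catches any leftover 0.11-era syntax
  $ terraform plan                    # dry run against the target account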