Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Modernize to TF v1.5, aws provider 5 #1

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions config.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@ terraform {
encrypt = true
bucket = "terraform-persistence"
key = "terraform-emr-pyspark.tfstate"
region = "eu-central-1"
region = "us-west-2"
}

required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}

provider "aws" {
version = "~> 1.0"
region = "${var.region}"
region = var.region
}
38 changes: 19 additions & 19 deletions main.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module "s3" {
source = "./modules/s3"
name = "${var.name}"
name = var.name
}

module "iam" {
Expand All @@ -9,26 +9,26 @@ module "iam" {

module "security" {
source = "./modules/security"
name = "${var.name}"
vpc_id = "${var.vpc_id}"
ingress_cidr_blocks = "${var.ingress_cidr_blocks}"
name = var.name
vpc_id = var.vpc_id
ingress_cidr_blocks = var.ingress_cidr_blocks
}

module "emr" {
source = "./modules/emr"
name = "${var.name}"
release_label = "${var.release_label}"
applications = "${var.applications}"
subnet_id = "${var.subnet_id}"
key_name = "${var.key_name}"
master_instance_type = "${var.master_instance_type}"
master_ebs_size = "${var.master_ebs_size}"
core_instance_type = "${var.core_instance_type}"
core_instance_count = "${var.core_instance_count}"
core_ebs_size = "${var.core_ebs_size}"
emr_master_security_group = "${module.security.emr_master_security_group}"
emr_slave_security_group = "${module.security.emr_slave_security_group}"
emr_ec2_instance_profile = "${module.iam.emr_ec2_instance_profile}"
emr_service_role = "${module.iam.emr_service_role}"
emr_autoscaling_role = "${module.iam.emr_autoscaling_role}"
name = var.name
release_label = var.release_label
applications = var.applications
subnet_id = var.subnet_id
key_name = var.key_name
master_instance_type = var.master_instance_type
master_ebs_size = var.master_ebs_size
core_instance_type = var.core_instance_type
core_instance_count = var.core_instance_count
core_ebs_size = var.core_ebs_size
emr_master_security_group = module.security.emr_master_security_group
emr_slave_security_group = module.security.emr_slave_security_group
emr_ec2_instance_profile = module.iam.emr_ec2_instance_profile
emr_service_role = module.iam.emr_service_role
emr_autoscaling_role = module.iam.emr_autoscaling_role
}
89 changes: 42 additions & 47 deletions modules/emr/main.tf
Original file line number Diff line number Diff line change
@@ -1,88 +1,83 @@
resource "aws_emr_cluster" "emr-spark-cluster" {
name = "${var.name}"
release_label = "${var.release_label}"
applications = "${var.applications}"
name = var.name
release_label = var.release_label
applications = var.applications
termination_protection = false
keep_job_flow_alive_when_no_steps = true

ec2_attributes {
subnet_id = "${var.subnet_id}"
key_name = "${var.key_name}"
emr_managed_master_security_group = "${var.emr_master_security_group}"
emr_managed_slave_security_group = "${var.emr_slave_security_group}"
instance_profile = "${var.emr_ec2_instance_profile}"
subnet_id = var.subnet_id
key_name = var.key_name
emr_managed_master_security_group = var.emr_master_security_group
emr_managed_slave_security_group = var.emr_slave_security_group
instance_profile = var.emr_ec2_instance_profile
}

ebs_root_volume_size = "12"

instance_group {
master_instance_group {
instance_type = var.master_instance_type
instance_count = 1
name = "EMR master"
instance_role = "MASTER"
instance_type = "${var.master_instance_type}"
instance_count = "1"

ebs_config {
size = "${var.master_ebs_size}"
size = var.master_ebs_size
type = "gp2"
volumes_per_instance = 1
}
}

instance_group {
core_instance_group {
instance_type = var.core_instance_type
instance_count = var.core_instance_count
name = "EMR slave"
instance_role = "CORE"
instance_type = "${var.core_instance_type}"
instance_count = "${var.core_instance_count}"

ebs_config {
size = "${var.core_ebs_size}"
size = var.core_ebs_size
type = "gp2"
volumes_per_instance = 1
}
}

tags {
tags = {
Name = "${var.name} - Spark cluster"
}

service_role = "${var.emr_service_role}"
autoscaling_role = "${var.emr_autoscaling_role}"
service_role = var.emr_service_role
autoscaling_role = var.emr_autoscaling_role

bootstrap_action {
name = "Bootstrap setup."
path = "s3://${var.name}/scripts/bootstrap_actions.sh"
}

step = [
{
name = "Copy script file from s3."
action_on_failure = "CONTINUE"
step {
name = "Copy script file from s3."
action_on_failure = "CONTINUE"

hadoop_jar_step {
jar = "command-runner.jar"
args = ["aws", "s3", "cp", "s3://${var.name}/scripts/pyspark_quick_setup.sh", "/home/hadoop/"]
}
},
{
name = "Setup pyspark with conda."
action_on_failure = "CONTINUE"
hadoop_jar_step {
jar = "command-runner.jar"
args = ["aws", "s3", "cp", "s3://${var.name}/scripts/pyspark_quick_setup.sh", "/home/hadoop/"]
}
}

hadoop_jar_step {
jar = "command-runner.jar"
args = ["sudo", "bash", "/home/hadoop/pyspark_quick_setup.sh"]
}
},
]
step {
name = "Setup pyspark with conda."
action_on_failure = "CONTINUE"

hadoop_jar_step {
jar = "command-runner.jar"
args = ["sudo", "bash", "/home/hadoop/pyspark_quick_setup.sh"]
}
}

configurations_json = <<EOF
[
{
[
{
"Classification": "spark-defaults",
"Properties": {
"Properties": {
"maximizeResourceAllocation": "true",
"spark.dynamicAllocation.enabled": "true"
}
}
]
EOF
}
]
EOF
}
2 changes: 1 addition & 1 deletion modules/emr/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ variable "subnet_id" {}
variable "key_name" {}
variable "release_label" {}
variable "applications" {
type = "list"
type = list(string)
}
variable "master_instance_type" {}
variable "master_ebs_size" {}
Expand Down
24 changes: 19 additions & 5 deletions modules/s3/main.tf
Original file line number Diff line number Diff line change
@@ -1,23 +1,37 @@
resource "aws_s3_bucket" "create_bucket" {
bucket = "${var.name}"
acl = "private"

tags = {
Name = "Bucket for EMR Bootstrap actions/Steps"
Environment = "Scripts"
}
}

resource "aws_s3_bucket_object" "bootstrap_action_file" {
# Object-ownership setting for the bucket declared as aws_s3_bucket.create_bucket.
# The aws_s3_bucket_acl.create_bucket resource lists this resource in its
# depends_on, so the ownership rule is applied before the ACL is set.
# NOTE(review): "BucketOwnerPreferred" is presumably chosen because it leaves
# ACLs enabled on the bucket (required for the "private" ACL) - confirm against
# the AWS provider documentation.
resource "aws_s3_bucket_ownership_controls" "create_bucket" {
bucket = aws_s3_bucket.create_bucket.id

rule {
object_ownership = "BucketOwnerPreferred"
}
}

# Applies the canned "private" ACL to the bucket declared as
# aws_s3_bucket.create_bucket.
resource "aws_s3_bucket_acl" "create_bucket" {
  bucket = aws_s3_bucket.create_bucket.id
  acl    = "private"

  # Explicit ordering: the ownership controls must exist before the ACL is set.
  depends_on = [aws_s3_bucket_ownership_controls.create_bucket]
}

resource "aws_s3_object" "bootstrap_action_file" {
bucket = "${var.name}"
key = "scripts/bootstrap_actions.sh"
source = "scripts/bootstrap_actions.sh"
depends_on = ["aws_s3_bucket.create_bucket"]
depends_on = [aws_s3_bucket.create_bucket]
}

resource "aws_s3_bucket_object" "pyspark_quick_setup_file" {
resource "aws_s3_object" "pyspark_quick_setup_file" {
bucket = "${var.name}"
key = "scripts/pyspark_quick_setup.sh"
source = "scripts/pyspark_quick_setup.sh"
depends_on = ["aws_s3_bucket.create_bucket"]
depends_on = [aws_s3_bucket.create_bucket]
}
18 changes: 9 additions & 9 deletions modules/security/main.tf
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
resource "aws_security_group" "emr_master" {
name = "${var.name} - EMR-master"
description = "Security group for EMR master."
vpc_id = "${var.vpc_id}"
vpc_id = var.vpc_id
revoke_rules_on_delete = true

ingress {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = ["${var.ingress_cidr_blocks}"]
cidr_blocks = var.ingress_cidr_blocks
}

ingress {
from_port = 4040
to_port = 4040
protocol = "tcp"
cidr_blocks = ["${var.ingress_cidr_blocks}"]
cidr_blocks = var.ingress_cidr_blocks
}

ingress {
from_port = 8888
to_port = 8888
protocol = "tcp"
cidr_blocks = ["${var.ingress_cidr_blocks}"]
cidr_blocks = var.ingress_cidr_blocks
}

ingress {
from_port = 20888
to_port = 20888
protocol = "tcp"
cidr_blocks = ["${var.ingress_cidr_blocks}"]
cidr_blocks = var.ingress_cidr_blocks
}

egress {
Expand All @@ -39,22 +39,22 @@ resource "aws_security_group" "emr_master" {
cidr_blocks = ["0.0.0.0/0"]
}

tags {
tags = {
Name = "EMR_master"
}
}

resource "aws_security_group" "emr_slave" {
name = "${var.name} - EMR-slave"
description = "Security group for EMR slave."
vpc_id = "${var.vpc_id}"
vpc_id = var.vpc_id
revoke_rules_on_delete = true

ingress {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = ["${var.ingress_cidr_blocks}"]
cidr_blocks = var.ingress_cidr_blocks
}

egress {
Expand All @@ -64,7 +64,7 @@ resource "aws_security_group" "emr_slave" {
cidr_blocks = ["0.0.0.0/0"]
}

tags {
tags = {
Name = "EMR_slave"
}
}
6 changes: 5 additions & 1 deletion modules/security/variables.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
variable "name" {}
variable "vpc_id" {}
variable "ingress_cidr_blocks" {}

# Source CIDR ranges for inbound traffic. The ingress rules visible in this
# module's main.tf (emr_master and emr_slave security groups) pass this list
# directly to their `cidr_blocks` argument.
variable "ingress_cidr_blocks" {
description = "List of CIDR blocks to allow incoming traffic from"
type = list(string)
}
6 changes: 3 additions & 3 deletions outputs.tf
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
output "id" {
value = "${module.emr.id}"
value = module.emr.id
}

output "name" {
value = "${module.emr.name}"
value = module.emr.name
}

output "master_public_dns" {
value = "${module.emr.master_public_dns}"
value = module.emr.master_public_dns
}
30 changes: 21 additions & 9 deletions scripts/bootstrap_actions.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,27 @@
#!/usr/bin/env bash

# install conda
wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \
&& /bin/bash ~/miniconda.sh -b -p $HOME/conda
# Ensure the script fails on any error
set -e

echo -e '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc
# Install Miniconda: download the installer, then run it non-interactively
# (-b) into the $HOME/conda prefix (-p).
# Expansions are quoted so a URL or home path containing whitespace or glob
# characters reaches wget/bash as a single argument.
MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"
wget --quiet "${MINICONDA_URL}" -O ~/miniconda.sh
/bin/bash ~/miniconda.sh -b -p "$HOME/conda"

# install packages
conda install -y notebook=5.7.* jupyter=1.0.* pandas seaborn
# Initialize Conda for bash shell. This will also update the .bashrc file.
source $HOME/conda/bin/activate
conda init bash

#install findspark
# Update conda
conda update -n base -c defaults conda -y

# Install packages
conda install -y notebook jupyter pandas seaborn

# Update pip and install Python packages
conda install pip -y
pip install --upgrade pip
pip install findspark
pip install sklearn
# Install scikit-learn under its real PyPI name. The `sklearn` distribution is
# a deprecated placeholder whose installation fails, which would abort this
# bootstrap script because it runs under `set -e`.
pip install findspark scikit-learn

# Cleanup
# rm ~/miniconda.sh
Loading