Skip to content

Commit 47d4fc1

Browse files
committed
Adding new project
1 parent 23d6656 commit 47d4fc1

File tree

286 files changed

+35151
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

286 files changed

+35151
-0
lines changed

AWS_Services/README.md

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Launching EMR cluster from command line
2+
### The example below creates a 3-node EMR cluster with 1 master node and 2 core (slave) nodes.
3+
4+
aws emr create-cluster \
5+
--applications Name=Ganglia Name=Spark Name=Zeppelin \
6+
--ebs-root-volume-size 10 \
7+
--ec2-attributes \
8+
'{"KeyName":"<ec2-key-pair-name>","InstanceProfile":"<ec2-instance-profile>","SubnetId":"<subnet-id>","EmrManagedSlaveSecurityGroup":"<slave-security-group-id>","EmrManagedMasterSecurityGroup":"<master-security-group-id>"}' \
9+
--service-role <emr-service-role> \
10+
--enable-debugging \
11+
--release-label <emr release version e.g emr-5.29.0> \
12+
--log-uri <s3-bucket-path-for-logging> \
13+
--name <cluster-name> \
14+
--instance-groups \
15+
'[
16+
{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"Master Instance Group"},
17+
{"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"Core Instance Group"}
18+
]' \
19+
--scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
20+
--region us-east-1
21+
22+
23+
# AWS s3 CLI Cheat Sheet
24+
![s3 cli cheat sheet](https://github.com/san089/Data_Engineering_Projects/blob/master/AWS_Services/aws-s3-cheat-sheet.png)

AWS_Services/aws-s3-cheat-sheet.png

57.9 KB
Loading

Airflow_CloudFormation.yaml

+267
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
AWSTemplateFormatVersion: '2010-09-09'
2+
3+
Description: Airflow server backed by Postgres RDS
4+
5+
Parameters:
6+
KeyName:
7+
Description: Name of an existing EC2 KeyPair to enable SSH access into the Airflow web server
8+
Type: AWS::EC2::KeyPair::KeyName
9+
ConstraintDescription: Must be the name of an existing EC2 KeyPair
10+
S3BucketName:
11+
Description: REQUIRED - A new S3 Bucket name. This bucket will be used to read and write the Movielens dataset.
12+
Type: String
13+
AllowedPattern: '.+'
14+
DBPassword:
15+
Default: airflowpassword
16+
NoEcho: 'true'
17+
Description: Airflow database admin account password
18+
Type: String
19+
MinLength: '8'
20+
MaxLength: '41'
21+
AllowedPattern: '[a-zA-Z0-9]*'
22+
ConstraintDescription: Must contain only alphanumeric characters
23+
24+
# Mapping to find the Amazon Linux AMI in each region.
25+
Mappings:
26+
RegionMap:
27+
us-east-1:
28+
AMI: ami-97785bed
29+
us-east-2:
30+
AMI: ami-f63b1193
31+
us-west-1:
32+
AMI: ami-824c4ee2
33+
us-west-2:
34+
AMI: ami-f2d3638a
35+
ca-central-1:
36+
AMI: ami-a954d1cd
37+
eu-west-1:
38+
AMI: ami-d834aba1
39+
eu-west-2:
40+
AMI: ami-403e2524
41+
eu-west-3:
42+
AMI: ami-8ee056f3
43+
eu-central-1:
44+
AMI: ami-5652ce39
45+
sa-east-1:
46+
AMI: ami-84175ae8
47+
ap-south-1:
48+
AMI: ami-531a4c3c
49+
ap-southeast-1:
50+
AMI: ami-68097514
51+
ap-southeast-2:
52+
AMI: ami-942dd1f6
53+
ap-northeast-1:
54+
AMI: ami-ceafcba8
55+
ap-northeast-2:
56+
AMI: ami-863090e8
57+
Resources:
58+
EC2Instance:
59+
Type: AWS::EC2::Instance
60+
Properties:
61+
KeyName: !Ref 'KeyName'
62+
SecurityGroups: [!Ref 'AirflowEC2SecurityGroup']
63+
InstanceType: 'm4.xlarge'
64+
IamInstanceProfile:
65+
Ref: EC2InstanceProfile
66+
Tags:
67+
-
68+
Key: Name
69+
Value: Airflow
70+
ImageId: !FindInMap
71+
- RegionMap
72+
- !Ref 'AWS::Region'
73+
- AMI
74+
UserData:
75+
Fn::Base64: !Sub |
76+
#!/bin/bash
77+
set -x
78+
exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1
79+
# Get the latest CloudFormation package
80+
echo "Installing aws-cfn"
81+
yum install -y aws-cfn-bootstrap
82+
# Start cfn-init
83+
/opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region}
84+
# Download and unzip the Movielens dataset
85+
wget http://files.grouplens.org/datasets/movielens/ml-latest.zip && unzip ml-latest.zip
86+
# Upload the movielens dataset files to the S3 bucket
87+
aws s3 cp ml-latest s3://${S3BucketName} --recursive
88+
# Install git
89+
sudo yum install -y git
90+
# Clone the git repository
91+
git clone https://github.com/aws-samples/aws-concurrent-data-orchestration-pipeline-emr-livy.git
92+
sudo pip install boto3
93+
# Install airflow using pip
94+
echo "Install Apache Airflow"
95+
sudo SLUGIFY_USES_TEXT_UNIDECODE=yes pip install -U apache-airflow
96+
# Encrypt connection passwords in metadata db
97+
sudo pip install apache-airflow[crypto]
98+
# Postgres operators and hook, support as an Airflow backend
99+
sudo pip install apache-airflow[postgres]
100+
sudo -H pip install six==1.10.0
101+
sudo pip install --upgrade six
102+
sudo pip install markupsafe
103+
sudo pip install --upgrade MarkupSafe
104+
echo 'export PATH=/usr/local/bin:$PATH' >> /root/.bash_profile
105+
source /root/.bash_profile
106+
# Initialize Airflow
107+
airflow initdb
108+
# Update the RDS connection in the Airflow Config file
109+
sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg
110+
sed -i '/sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg
111+
# Update the type of executor in the Airflow Config file
112+
sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg
113+
sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg
114+
airflow initdb
115+
# Move all the files to the ~/airflow directory. The Airflow config file is setup to hold all the DAG related files in the ~/airflow/ folder.
116+
mv aws-concurrent-data-orchestration-pipeline-emr-livy/* ~/airflow/
117+
# Delete the higher-level git repository directory
118+
rm -rf aws-concurrent-data-orchestration-pipeline-emr-livy
119+
# Replace the '<s3-bucket>' placeholder in each of the .scala files under dags/transform with the name of the S3 bucket given in the S3BucketName stack parameter. No manual edit is needed: the bucket name is substituted into the command below when the stack is created.
120+
sed -i 's/<s3-bucket>/${S3BucketName}/g' /root/airflow/dags/transform/*
121+
# Run Airflow webserver
122+
airflow webserver
123+
Metadata:
124+
AWS::CloudFormation::Init:
125+
configSets:
126+
install:
127+
- gcc
128+
gcc:
129+
packages:
130+
yum:
131+
gcc: []
132+
DependsOn:
133+
- DBInstance
134+
- AirflowEC2SecurityGroup
135+
DBInstance:
136+
Type: AWS::RDS::DBInstance
137+
DeletionPolicy: Delete
138+
Properties:
139+
DBName: airflowdb
140+
Engine: postgres
141+
MasterUsername: airflow
142+
MasterUserPassword: !Ref 'DBPassword'
143+
DBInstanceClass: db.t2.small
144+
AllocatedStorage: 5
145+
DBSecurityGroups:
146+
- Ref: DBSecurityGroup
147+
AirflowEC2SecurityGroup:
148+
Type: AWS::EC2::SecurityGroup
149+
Properties:
150+
GroupName: AirflowEC2SG
151+
GroupDescription: Enable HTTP access via port 80 + SSH access
152+
SecurityGroupIngress:
153+
- IpProtocol: tcp
154+
FromPort: 80
155+
ToPort: 80
156+
CidrIp: 0.0.0.0/0
157+
- IpProtocol: tcp
158+
FromPort: 8080
159+
ToPort: 8080
160+
CidrIp: 0.0.0.0/0
161+
- IpProtocol: tcp
162+
FromPort: 22
163+
ToPort: 22
164+
CidrIp: 0.0.0.0/0
165+
AirflowEMRMasterEC2SecurityGroup:
166+
Type: AWS::EC2::SecurityGroup
167+
Properties:
168+
GroupName: AirflowEMRMasterSG
169+
GroupDescription: Airflow EMR Master SG
170+
DependsOn:
171+
- AirflowEC2SecurityGroup
172+
AirflowEMRMasterInboundRule:
173+
Type: AWS::EC2::SecurityGroupIngress
174+
Properties:
175+
IpProtocol: tcp
176+
FromPort: '8998'
177+
ToPort: '8998'
178+
SourceSecurityGroupName: !Ref 'AirflowEC2SecurityGroup'
179+
GroupName: !Ref 'AirflowEMRMasterEC2SecurityGroup'
180+
AirflowEMRSlaveEC2SecurityGroup:
181+
Type: AWS::EC2::SecurityGroup
182+
Properties:
183+
GroupName: AirflowEMRSlaveSG
184+
GroupDescription: Airflow EMR Slave SG
185+
DBSecurityGroup:
186+
Type: AWS::RDS::DBSecurityGroup
187+
Properties:
188+
GroupDescription: Frontend Access
189+
DBSecurityGroupIngress:
190+
EC2SecurityGroupName:
191+
Ref: AirflowEC2SecurityGroup
192+
EC2Role:
193+
Type: AWS::IAM::Role
194+
Properties:
195+
RoleName: AirflowInstanceRole
196+
AssumeRolePolicyDocument:
197+
Version: "2012-10-17"
198+
Statement:
199+
-
200+
Effect: "Allow"
201+
Principal:
202+
Service:
203+
- "ec2.amazonaws.com"
204+
Action:
205+
- "sts:AssumeRole"
206+
ManagedPolicyArns:
207+
- arn:aws:iam::aws:policy/AmazonS3FullAccess
208+
- arn:aws:iam::aws:policy/AmazonElasticMapReduceFullAccess
209+
EC2InstanceProfile:
210+
Type: AWS::IAM::InstanceProfile
211+
Properties:
212+
InstanceProfileName: AirflowInstanceProfile
213+
Roles:
214+
-
215+
Ref: EC2Role
216+
EmrRole:
217+
Type: AWS::IAM::Role
218+
Properties:
219+
RoleName: EmrRole
220+
AssumeRolePolicyDocument:
221+
Version: "2012-10-17"
222+
Statement:
223+
-
224+
Effect: "Allow"
225+
Principal:
226+
Service:
227+
- "elasticmapreduce.amazonaws.com"
228+
- "s3.amazonaws.com"
229+
Action:
230+
- "sts:AssumeRole"
231+
ManagedPolicyArns:
232+
- arn:aws:iam::aws:policy/AmazonS3FullAccess
233+
- arn:aws:iam::aws:policy/AmazonElasticMapReduceFullAccess
234+
EmrEc2Role:
235+
Type: AWS::IAM::Role
236+
Properties:
237+
RoleName: EmrEc2Role
238+
AssumeRolePolicyDocument:
239+
Version: "2012-10-17"
240+
Statement:
241+
-
242+
Effect: "Allow"
243+
Principal:
244+
Service:
245+
- "ec2.amazonaws.com"
246+
Action:
247+
- "sts:AssumeRole"
248+
ManagedPolicyArns:
249+
- arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role
250+
- arn:aws:iam::aws:policy/AmazonS3FullAccess
251+
EmrEc2InstanceProfile:
252+
Type: AWS::IAM::InstanceProfile
253+
Properties:
254+
InstanceProfileName: EmrEc2InstanceProfile
255+
Roles:
256+
-
257+
Ref: EmrEc2Role
258+
S3Bucket:
259+
Type: AWS::S3::Bucket
260+
DeletionPolicy: Retain
261+
Properties:
262+
AccessControl: BucketOwnerFullControl
263+
BucketName: !Ref 'S3BucketName'
264+
Outputs:
265+
AirflowEC2PublicDNSName:
266+
Description: Public DNS Name of the Airflow EC2 instance
267+
Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]]

Airflow_Data_Pipelines/Setup_Redshift_Connection_Airflow.md

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from datetime import datetime, timedelta
2+
from airflow import DAG
3+
from airflow.operators.dummy_operator import DummyOperator
4+
from airflow.operators import LoadDimensionOperator
5+
from helpers import SqlQueries
6+
7+
8+
def load_dimension_subdag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        sql_statement,
        delete_load,
        table_name,
        *args, **kwargs):
    """Build a DAG that holds a single ``LoadDimensionOperator`` task.

    The DAG id is ``"<parent_dag_name>.<task_id>"`` and the one task inside
    it reuses ``task_id`` as its own id.

    Args:
        parent_dag_name: Id of the parent DAG; used as the prefix of the
            returned DAG's id.
        task_id: Suffix of the returned DAG's id and id of the load task.
        redshift_conn_id: Forwarded to the operator as ``redshift_conn_id``.
        sql_statement: Forwarded to the operator as ``sql_query``.
        delete_load: Forwarded to the operator as ``delete_load``.
            NOTE(review): presumably toggles delete-then-insert loading;
            confirm against ``LoadDimensionOperator``.
        table_name: Forwarded to the operator as ``table_name``.
        *args: Accepted for call-site compatibility but not used.
        **kwargs: Forwarded unchanged to the ``DAG`` constructor.

    Returns:
        The newly created ``DAG`` instance.
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # The operator is handed the DAG through ``dag=``; it is not referenced
    # again here, so no local name is kept for it.
    LoadDimensionOperator(
        task_id=task_id,
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_statement,
        delete_load=delete_load,
        table_name=table_name,
    )

    return dag

0 commit comments

Comments
 (0)