#################################################################################
# GLOBALS                                                                       #
#################################################################################
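# The variables referenced throughout this Makefile (AIRFLOW_DOCKER_ID,
# AIRFLOW_DAG_ROOT_FOLDER, SPARK_LOCAL_MASTER_DOCKER_ID, AWS_EMR_SSH_IDENTITY_FILE,
# AWS_EMR_LOG_BUCKET, AWS_S3_S3FS_PASSWORD_FILE, AWS_ACCESS_KEY and
# AWS_SECRET_ACCESS_KEY) are not defined here; they are assumed to be supplied by
# the environment or on the make command line, e.g. (illustrative values):
#   make airflow_deploy AIRFLOW_DOCKER_ID=<container id> AIRFLOW_DAG_ROOT_FOLDER=~/airflow/dags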
#################################################################################
# COMMANDS                                                                      #
#################################################################################
.PHONY: requirements
## Install Python Dependencies
requirements:
	@echo ">>> Creating conda environment nypd-complaint-analysis"
	conda env create -f conda.yml
.PHONY: clean
## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete
	docker exec -it ${AIRFLOW_DOCKER_ID} sh -c "/entrypoint.sh airflow delete_dag nypd_complaint_analysis"
	rm -ri ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint
.PHONY: cleanup_aws
## Remove bastion host & RedShift cluster
cleanup_aws:
	python src/cleanup_cluster.py
# Source: https://stackoverflow.com/questions/53382383/makefile-cant-use-conda-activate
# Need to specify bash in order for conda activate to work.
SHELL=/bin/bash
# Note that the extra activate is needed to ensure that the activate floats env to the front of PATH
CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
CONDA_DEACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda deactivate
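# The targets below follow this pattern: run the activated command in a single
# parenthesised sub-shell so activation is scoped to that one recipe line
# (illustrative sketch, <env-name> and <command> are placeholders):
#   ($(CONDA_ACTIVATE) <env-name>; <command>; $(CONDA_DEACTIVATE))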
.PHONY: load_data_to_redshift
## Load data from S3 to RedShift
load_data_to_redshift:
	($(CONDA_ACTIVATE) nypd-complaint-analysis; cd src; python load_data_to_redshift.py; $(CONDA_DEACTIVATE))
.PHONY: airflow_trigger_dag_local
## Execute airflow on local dev environment
airflow_trigger_dag_local:
	docker exec -it ${AIRFLOW_DOCKER_ID} sh -c "/entrypoint.sh airflow trigger_dag nypd_complaint_analysis --conf '{\"env\":\"local\"}'"
.PHONY: airflow_trigger_dag_aws_keep_redshift
## Execute airflow on AWS and keep RedShift cluster
airflow_trigger_dag_aws_keep_redshift:
	docker exec -it ${AIRFLOW_DOCKER_ID} sh -c "/entrypoint.sh airflow trigger_dag nypd_complaint_analysis --conf '{\"env\":\"aws\", \"delete_redshift_cluster\":\"False\"}'"
.PHONY: airflow_trigger_dag_aws_delete_redshift
## Execute airflow on AWS and delete RedShift cluster
airflow_trigger_dag_aws_delete_redshift:
	docker exec -it ${AIRFLOW_DOCKER_ID} sh -c "/entrypoint.sh airflow trigger_dag nypd_complaint_analysis --conf '{\"env\":\"aws\", \"delete_redshift_cluster\":\"True\"}'"
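# The --conf JSON passed above is exposed to the DAG at run time via Airflow's
# dag_run.conf. A minimal sketch of how a task callable could read it (names are
# illustrative, not necessarily how nypd_complaint_airflow.py does it):
#   env = context["dag_run"].conf.get("env", "local")
#   delete_cluster = context["dag_run"].conf.get("delete_redshift_cluster", "True")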
.PHONY: airflow_deploy
## Deploy airflow file
airflow_deploy:
	mkdir -p ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/nypd_complaint_airflow.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/transform_data.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/create_redshift_cluster_database.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/redshift.cfg ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/load_data_to_redshift.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/create_bastion_host.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/cleanup_cluster.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/emr_queries.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/redshift_queries.py ${AIRFLOW_DAG_ROOT_FOLDER}/.
	cp src/transform_data.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/create_redshift_cluster_database.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/redshift.cfg ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/load_data_to_redshift.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/create_bastion_host.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/cleanup_cluster.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/emr_queries.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	cp src/redshift_queries.py ${AIRFLOW_DAG_ROOT_FOLDER}/nypd-complaint/.
	docker cp src/redshift.cfg ${AIRFLOW_DOCKER_ID}:/tmp/.
	docker cp src/bastion.cfg ${AIRFLOW_DOCKER_ID}:/tmp/.
	docker cp ${AWS_EMR_SSH_IDENTITY_FILE} ${AIRFLOW_DOCKER_ID}:/tmp/pkey.pem
	aws s3 cp src/emr_queries.py s3://nypd-complaint/code/emr_queries.py
	aws s3 cp src/redshift_queries.py s3://nypd-complaint/code/redshift_queries.py
	aws s3 cp src/transform_data.py s3://nypd-complaint/code/transform_data.py
	aws s3 cp src/install-requirements.sh s3://nypd-complaint/code/install-requirements.sh
.PHONY: airflow_get_configs
## Download configparser files from Airflow to local environment
airflow_get_configs:
	docker cp ${AIRFLOW_DOCKER_ID}:/tmp/bastion.cfg /tmp/.
	docker cp ${AIRFLOW_DOCKER_ID}:/tmp/redshift.cfg /tmp/.
.PHONY: airflow_clear_runs
## Clear all airflow runs
airflow_clear_runs:
	docker exec -it ${AIRFLOW_DOCKER_ID} sh -c "/entrypoint.sh airflow clear -c nypd_complaint_analysis"
.PHONY: run_local_transform
## Run local Spark transforms
run_local_transform:
	docker cp src/transform_data.py ${SPARK_LOCAL_MASTER_DOCKER_ID}:/tmp/.
	docker cp src/local.cfg ${SPARK_LOCAL_MASTER_DOCKER_ID}:/tmp/.
	docker exec -it ${SPARK_LOCAL_MASTER_DOCKER_ID} sh -c "python /tmp/transform_data.py 'local'"
.PHONY: spark_analysis_local
## Open interactive prompt for Spark ad hoc analysis on local Spark docker container
spark_analysis_local:
	docker cp src/interactive_analysis.py ${SPARK_LOCAL_MASTER_DOCKER_ID}:/tmp/.
	docker exec -it ${SPARK_LOCAL_MASTER_DOCKER_ID} sh -c "python -i /tmp/interactive_analysis.py local"
.PHONY: spark_analysis_aws
## Open interactive prompt for Spark ad hoc analysis on AWS EMR cluster
spark_analysis_aws:
	$(eval emr_id=$(shell sh -c "aws emr list-clusters --region us-west-2 --active | jq '[.Clusters | .[] | .Id][0]'"))
	@echo $(emr_id)
	$(eval dns=$(shell sh -c "aws emr describe-cluster --cluster-id $(emr_id) --region us-west-2 --query Cluster.MasterPublicDnsName"))
	@echo $(dns)
	scp -i ${AWS_EMR_SSH_IDENTITY_FILE} -o StrictHostKeyChecking=no src/interactive_analysis.py hadoop@$(dns):/tmp/.
	ssh -i ${AWS_EMR_SSH_IDENTITY_FILE} -o StrictHostKeyChecking=no hadoop@$(dns) "/usr/bin/python3 -i /tmp/interactive_analysis.py aws"
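# Note on the cluster lookup above (also used by test_transform_aws and
# ssh_aws_emr_master): `aws emr list-clusters --active` returns JSON of the form
#   {"Clusters": [{"Id": "j-...", ...}, ...]}
# so the jq filter picks the Id of the first active cluster. The value comes back
# JSON-quoted; the shell strips the quotes when $(emr_id)/$(dns) are reused
# (jq -r or --output text would avoid the quotes altogether).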
.PHONY: test_transform_aws
## Test transform_data.py on existing EMR instance on AWS
test_transform_aws:
	$(eval emr_id=$(shell sh -c "aws emr list-clusters --region us-west-2 --active | jq '[.Clusters | .[] | .Id][0]'"))
	@echo $(emr_id)
	$(eval dns=$(shell sh -c "aws emr describe-cluster --cluster-id $(emr_id) --region us-west-2 --query Cluster.MasterPublicDnsName"))
	@echo $(dns)
	scp -i ${AWS_EMR_SSH_IDENTITY_FILE} -o StrictHostKeyChecking=no src/transform_data.py hadoop@$(dns):/tmp/.
	scp -i ${AWS_EMR_SSH_IDENTITY_FILE} -o StrictHostKeyChecking=no src/emr_queries.py hadoop@$(dns):/tmp/.
	aws emr add-steps --region us-west-2 --cluster-id $(emr_id) --steps Type="CUSTOM_JAR",Name="Test Transforms",Jar="command-runner.jar",ActionOnFailure="CONTINUE",Args="['sudo', '-H', '-u', 'hadoop', 'bash', '-c', \"cd /tmp; /usr/bin/python3 /tmp/transform_data.py aws ${AWS_ACCESS_KEY} ${AWS_SECRET_ACCESS_KEY}\"]"
.PHONY: test_dimensional_model_datasets
## Run pytest checks on the dimensional model datasets built by transform_data.py
test_dimensional_model_datasets:
	pytest src/transform_data.py
.PHONY: test_redshift_aws_creation
## Test creation of the RedShift cluster and database
test_redshift_aws_creation:
	($(CONDA_ACTIVATE) nypd-complaint-analysis; cd src; python create_redshift_cluster_database.py; $(CONDA_DEACTIVATE))
.PHONY: connect_psql_redshift
## Connect to nypd_complaint via psql
connect_psql_redshift:
	$(eval public_dns=$(shell sh -c "grep PUBLIC_DNS /tmp/bastion.cfg | cut -d' ' -f 3"))
	$(eval redshift_host=$(shell sh -c "grep DWH_HOST /tmp/redshift.cfg | cut -d' ' -f 3"))
	$(eval db_user=$(shell sh -c "grep DWH_DB_USER /tmp/redshift.cfg | cut -d' ' -f 3"))
	$(eval db_password=$(shell sh -c "grep DWH_DB_PASSWORD /tmp/redshift.cfg | cut -d' ' -f 3"))
	$(eval db_port=$(shell sh -c "grep DWH_PORT /tmp/redshift.cfg | cut -d' ' -f 3"))
	ssh -D localhost:8087 -S /tmp/.ssh-aws-bastion -M -o StrictHostKeyChecking=no -i ${AWS_EMR_SSH_IDENTITY_FILE} ec2-user@$(public_dns) -fNT -L 5439:$(redshift_host):5439
	PGPASSWORD=$(db_password) psql -h 127.0.0.1 -d nypd_complaint -U $(db_user) -p $(db_port)
	ssh -S /tmp/.ssh-aws-bastion -O exit $(public_dns)
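# The grep/cut pipeline above assumes configparser-style entries of the form
# KEY = VALUE in /tmp/bastion.cfg and /tmp/redshift.cfg, e.g. (illustrative value):
#   DWH_PORT = 5439
# so `cut -d' ' -f 3` returns the third space-separated field, i.e. the value.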
.PHONY: ssh_aws_emr_master
## Connect to master node of running AWS EMR cluster
ssh_aws_emr_master:
	$(eval emr_id=$(shell sh -c "aws emr list-clusters --region us-west-2 --active | jq '[.Clusters | .[] | .Id][0]'"))
	@echo $(emr_id)
	$(eval dns=$(shell sh -c "aws emr describe-cluster --cluster-id $(emr_id) --region us-west-2 --query Cluster.MasterPublicDnsName"))
	@echo $(dns)
	ssh -i ${AWS_EMR_SSH_IDENTITY_FILE} -o StrictHostKeyChecking=no hadoop@$(dns)
.PHONY: mount_aws_emr_log_directory
## Mount AWS EMR logs via s3fs to local directory
mount_aws_emr_log_directory:
	mkdir -p /tmp/aws-emr-logs
	$(eval user_id=$(shell sh -c "id -u"))
	$(eval user_group_id=$(shell sh -c "id -g"))
	s3fs ${AWS_EMR_LOG_BUCKET} /tmp/aws-emr-logs -o passwd_file=${AWS_S3_S3FS_PASSWORD_FILE} -o allow_other,uid=$(user_id),gid=$(user_group_id) -o umask=0007
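# s3fs reads credentials from the passwd_file given above; that file is expected to
# contain a single line of the form ACCESS_KEY_ID:SECRET_ACCESS_KEY (placeholder
# values shown):
#   AKIAXXXXXXXXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx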
.PHONY: lint
## Lint all python files
lint:
	yapf -i src/*.py
#################################################################################
# Self Documenting Commands                                                     #
#################################################################################
.DEFAULT_GOAL := help
# From: https://github.com/drivendata/cookiecutter-data-science/blob/master/%7B%7B%20cookiecutter.repo_name%20%7D%7D/Makefile
# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
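# Example of the resulting `make help` output (illustrative; colour codes omitted):
#   clean              Delete all compiled Python files
#   lint               Lint all python files
#   requirements       Install Python Dependencies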
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')