-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile
336 lines (258 loc) · 11 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# Every target listed here is a command, not a file. Declaring it phony makes
# make run its recipe even when a file or directory of the same name exists.
.PHONY: clean lint requirements train_model predict_model train_test_split \
        create_environment test_environment \
        unit_train_test_split unit_build_features \
        SEED_train_model SEED_predict_model \
        NO2_train_test_split NO2_train_model NO2_predict_model \
        PM25_train_test_split PM25_train_model PM25_predict_model
#################################################################################
# GLOBALS #
#################################################################################
# Directory that contains this Makefile (resolved through symlinks).
PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
# Project-wide settings.
PROJECT_NAME := airpollutionnowcast
PYTHON_INTERPRETER := python3
BUCKET := [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE := default
# HAS_CONDA is "True" when `which conda` prints something, "False" otherwise.
ifneq (,$(shell which conda))
HAS_CONDA=True
else
HAS_CONDA=False
endif
#################################################################################
# VARIABLES #
#################################################################################
#CONFIG_PATH = config/parameters.ini
# Default locations of the train / validation / test CSV files that are passed
# to the split, training and prediction scripts.
TRAIN_DATA_PATH := data/processed/train.csv
VALID_DATA_PATH := data/processed/valid.csv
TEST_DATA_PATH := data/processed/test.csv
#################################################################################
# PROJECT CONSTRUCT #
#################################################################################
# Runs flake8 over the src directory.
## Lint using flake8
lint:
	flake8 src
# Creates the conda environment named $(PROJECT_NAME) from environment.yml.
# The environment is NOT activated here: every recipe line runs in its own
# shell, so a `conda activate` inside the recipe would be discarded as soon as
# that line finished (and it fails in a shell without conda's shell hook).
# The recipe prints the activation command for the user to run instead.
## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
	conda env create -f environment.yml --name $(PROJECT_NAME)
	@echo ">>> Environment created. Activate it with: conda activate $(PROJECT_NAME)"
else
	@echo ">>> CONDA NEEDED TO CREATE ENVIRONMENT"
endif
# Runs test_environment.py inside the $(PROJECT_NAME) conda environment.
# `conda run` is used because a separate `conda activate` recipe line would
# run in its own shell and leave the following line outside the environment.
## Test python environment is setup correctly
test_environment:
	conda run --name $(PROJECT_NAME) $(PYTHON_INTERPRETER) test_environment.py
# After test_environment succeeds, upgrades pip/setuptools/wheel and then
# installs the packages listed in requirements.txt.
## Install Python Dependencies
requirements: test_environment
	${PYTHON_INTERPRETER} -m pip install -U pip setuptools wheel
	${PYTHON_INTERPRETER} -m pip install -r requirements.txt
#################################################################################
# COMMANDS #
#################################################################################
### extract search trends
#data/interim/search.csv:
# $(PYTHON_INTERPRETER) airpolnowcast/data/extract_search_trend.py $(CONFIG_PATH) $@
#
### extract pol labels
#data/interim/pol.csv:
# $(PYTHON_INTERPRETER) airpolnowcast/data/extract_pol_label.py $(CONFIG_PATH) $@
#
### extract physical measurements data
#data/interim/phys.csv:
# $(PYTHON_INTERPRETER) airpolnowcast/data/extract_phys_meas.py $(CONFIG_PATH) $@
#
### process physical measurements features
#data/interim/process_phys.csv: data/interim/phys.csv
# $(PYTHON_INTERPRETER) airpolnowcast/data/process_phys_feature.py $(CONFIG_PATH) $< $@
#
### merge all data
#data/interim/merged.csv: data/interim/pol.csv data/interim/search.csv data/interim/process_phys.csv
# $(PYTHON_INTERPRETER) airpolnowcast/data/merge_data_files.py $(CONFIG_PATH) $^ $@
#
### train test split into files
#train_test_split: data/interim/merged.csv
# $(PYTHON_INTERPRETER) airpolnowcast/data/train_test_split.py $(CONFIG_PATH) $< $(TRAIN_DATA_PATH) $(VALID_DATA_PATH) $(TEST_DATA_PATH)
#
### train model
#train_model:
# $(PYTHON_INTERPRETER) airpolnowcast/evaluation/train_model.py $(CONFIG_PATH) $(TRAIN_DATA_PATH) $(VALID_DATA_PATH)
#
### predict and get report
#predict_model:
# $(PYTHON_INTERPRETER) airpolnowcast/evaluation/predict_model.py $(CONFIG_PATH) $(TEST_DATA_PATH)
#
### predict fine-tuning results
#predict_fine_tuning:
# $(PYTHON_INTERPRETER) airpolnowcast/evaluation/predict_fine_tuning.py $(CONFIG_PATH) $(TRAIN_DATA_PATH) $(VALID_DATA_PATH) $(TEST_DATA_PATH)
##########
# Variables and rules for running train_test_split on the unit-test data
##########
# unit_test_config
# Config file and train / validation / test CSV paths used by the unit_* rules.
UNIT_CONFIG_PATH := config/unit_test.ini
UNIT_TRAIN_DATA_PATH := data/processed/unit_test/train.csv
UNIT_VALID_DATA_PATH := data/processed/unit_test/valid.csv
UNIT_TEST_DATA_PATH := data/processed/unit_test/test.csv
# process physical measurements features (unit-test data)
# The input file is declared as a prerequisite so that make rebuilds the
# target when phys.csv changes and reports a clear error when it is missing.
data/external/unit_test/process_phys.csv: data/external/unit_test/phys.csv
	$(PYTHON_INTERPRETER) src/data/process_phys_feature.py $(UNIT_CONFIG_PATH) $< $@
# merge all data (unit-test data)
# All three inputs are declared as prerequisites, in the order the script
# receives them, so process_phys.csv is built first when needed and the merge
# is redone whenever one of the inputs changes.
data/interim/unit_test/merged.csv: data/external/unit_test/pol.csv data/external/unit_test/search.csv data/external/unit_test/process_phys.csv
	$(PYTHON_INTERPRETER) src/data/merge_data_files.py $(UNIT_CONFIG_PATH) $^ $@
# train test split into files (unit-test data)
# Passes the merged file and the three UNIT_* paths to src/data/train_test_split.py.
unit_train_test_split: data/interim/unit_test/merged.csv
	${PYTHON_INTERPRETER} src/data/train_test_split.py ${UNIT_CONFIG_PATH} $< ${UNIT_TRAIN_DATA_PATH} ${UNIT_VALID_DATA_PATH} ${UNIT_TEST_DATA_PATH}
# unit test on build features
# Runs src/features/build_features.ut.py with -v, the unit-test config and the
# unit-test training CSV path.
unit_build_features:
	${PYTHON_INTERPRETER} src/features/build_features.ut.py -v ${UNIT_CONFIG_PATH} ${UNIT_TRAIN_DATA_PATH}
#########
# Variables and rules for training the model on word vectors (without dict training)
#########
# Config file passed to every script of the default (word-vector) pipeline.
CONFIG_PATH := config/word_vector.ini
# Calls src/data/extract_search_trend.py with the config path and the target file name.
## extract search trends
data/interim/search.csv:
	${PYTHON_INTERPRETER} src/data/extract_search_trend.py ${CONFIG_PATH} $@
# Calls src/data/extract_pol_label.py with the config path and the target file name.
## extract pol labels
data/interim/pol.csv:
	${PYTHON_INTERPRETER} src/data/extract_pol_label.py ${CONFIG_PATH} $@
# Calls src/data/extract_phys_meas.py with the config path and the target file name.
## extract physical measurements data
data/interim/phys.csv:
	${PYTHON_INTERPRETER} src/data/extract_phys_meas.py ${CONFIG_PATH} $@
# Calls src/data/process_phys_feature.py with the config path, the extracted
# phys.csv (the prerequisite) and the target file name.
## process physical measurements features
data/interim/process_phys.csv: data/interim/phys.csv
	${PYTHON_INTERPRETER} src/data/process_phys_feature.py ${CONFIG_PATH} $< $@
# Calls src/data/merge_data_files.py with the config path, all prerequisites in
# the order listed (pol, search, process_phys) and the target file name.
## merge all data
data/interim/merged.csv: data/interim/pol.csv data/interim/search.csv data/interim/process_phys.csv
	${PYTHON_INTERPRETER} src/data/merge_data_files.py ${CONFIG_PATH} $^ $@
# Passes the merged file and the train / valid / test paths to
# src/data/train_test_split.py.
## train test split into files
train_test_split: data/interim/merged.csv
	${PYTHON_INTERPRETER} src/data/train_test_split.py ${CONFIG_PATH} $< ${TRAIN_DATA_PATH} ${VALID_DATA_PATH} ${TEST_DATA_PATH}
# Runs src/evaluation/train_model.py with the config, train and validation paths.
## train model
train_model:
	${PYTHON_INTERPRETER} src/evaluation/train_model.py ${CONFIG_PATH} ${TRAIN_DATA_PATH} ${VALID_DATA_PATH}
# Runs src/evaluation/predict_model.py with the config and test-data paths.
## predict and get report
predict_model:
	${PYTHON_INTERPRETER} src/evaluation/predict_model.py ${CONFIG_PATH} ${TEST_DATA_PATH}
#########
# Variables and rules for training the model on word vectors (without dict training), using only the seed queries
#########
# Config file passed to the SEED_* rules.
SEED_CONFIG_PATH := config/word_vector_seed_queries.ini
# All other operations (e.g. extracting search trends) are the same as in the
# section above; only the training and prediction rules are defined here.
## train model
SEED_train_model:
# Same script and data paths as train_model, but with SEED_CONFIG_PATH.
	${PYTHON_INTERPRETER} src/evaluation/train_model.py ${SEED_CONFIG_PATH} ${TRAIN_DATA_PATH} ${VALID_DATA_PATH}
# Same script and test path as predict_model, but with SEED_CONFIG_PATH.
## predict and get report
SEED_predict_model:
	${PYTHON_INTERPRETER} src/evaluation/predict_model.py ${SEED_CONFIG_PATH} ${TEST_DATA_PATH}
#########
# Variables and rules for training the model on word vectors (without dict training), using only the seed queries, for NO2 prediction
#########
# Config file and train / validation / test CSV paths used by the NO2 rules.
NO2_CONFIG_PATH := config/NO2/word_vector_NO2.ini
NO2_TRAIN_DATA_PATH := data/processed/NO2/train.csv
NO2_VALID_DATA_PATH := data/processed/NO2/valid.csv
NO2_TEST_DATA_PATH := data/processed/NO2/test.csv
# Changed operations (steps that differ from the default pipeline above)
# Calls src/data/extract_pol_label.py with the NO2 config and the target file name.
## extract pol labels
data/interim/NO2/pol.csv:
	${PYTHON_INTERPRETER} src/data/extract_pol_label.py ${NO2_CONFIG_PATH} $@
# Merges the NO2 labels with the shared search.csv and process_phys.csv; the
# prerequisites are passed to the script in the order listed.
## merge all data
data/interim/NO2/merged.csv: data/interim/NO2/pol.csv data/interim/search.csv data/interim/process_phys.csv
	${PYTHON_INTERPRETER} src/data/merge_data_files.py ${NO2_CONFIG_PATH} $^ $@
# Passes the NO2 merged file and the NO2 train / valid / test paths to
# src/data/train_test_split.py.
## train test split into files
NO2_train_test_split: data/interim/NO2/merged.csv
	${PYTHON_INTERPRETER} src/data/train_test_split.py ${NO2_CONFIG_PATH} $< ${NO2_TRAIN_DATA_PATH} ${NO2_VALID_DATA_PATH} ${NO2_TEST_DATA_PATH}
# Runs src/evaluation/train_model.py with the NO2 config, train and validation paths.
## train model
NO2_train_model:
	${PYTHON_INTERPRETER} src/evaluation/train_model.py ${NO2_CONFIG_PATH} ${NO2_TRAIN_DATA_PATH} ${NO2_VALID_DATA_PATH}
# Runs src/evaluation/predict_model.py with the NO2 config and test-data paths.
## predict and get report
NO2_predict_model:
	${PYTHON_INTERPRETER} src/evaluation/predict_model.py ${NO2_CONFIG_PATH} ${NO2_TEST_DATA_PATH}
#########
# Variables and rules for training the model on word vectors (without dict training), using only the seed queries, for PM25 prediction
#########
# Config file and train / validation / test CSV paths used by the PM25 rules.
PM25_CONFIG_PATH := config/PM25/word_vector_PM25.ini
PM25_TRAIN_DATA_PATH := data/processed/PM25/train.csv
PM25_VALID_DATA_PATH := data/processed/PM25/valid.csv
PM25_TEST_DATA_PATH := data/processed/PM25/test.csv
# Changed operations (steps that differ from the default pipeline above)
# Calls src/data/extract_pol_label.py with the PM25 config and the target file name.
## extract pol labels
data/interim/PM25/pol.csv:
	${PYTHON_INTERPRETER} src/data/extract_pol_label.py ${PM25_CONFIG_PATH} $@
# Merges the PM25 labels with the shared search.csv and process_phys.csv; the
# prerequisites are passed to the script in the order listed.
## merge all data
data/interim/PM25/merged.csv: data/interim/PM25/pol.csv data/interim/search.csv data/interim/process_phys.csv
	${PYTHON_INTERPRETER} src/data/merge_data_files.py ${PM25_CONFIG_PATH} $^ $@
# Passes the PM25 merged file and the PM25 train / valid / test paths to
# src/data/train_test_split.py.
## train test split into files
PM25_train_test_split: data/interim/PM25/merged.csv
	${PYTHON_INTERPRETER} src/data/train_test_split.py ${PM25_CONFIG_PATH} $< ${PM25_TRAIN_DATA_PATH} ${PM25_VALID_DATA_PATH} ${PM25_TEST_DATA_PATH}
# Runs src/evaluation/train_model.py with the PM25 config, train and validation paths.
## train model
PM25_train_model:
	${PYTHON_INTERPRETER} src/evaluation/train_model.py ${PM25_CONFIG_PATH} ${PM25_TRAIN_DATA_PATH} ${PM25_VALID_DATA_PATH}
# Runs src/evaluation/predict_model.py with the PM25 config and test-data paths.
## predict and get report
PM25_predict_model:
	${PYTHON_INTERPRETER} src/evaluation/predict_model.py ${PM25_CONFIG_PATH} ${PM25_TEST_DATA_PATH}
# Besides Python bytecode, this removes the CSV files directly under data/raw,
# data/interim, data/processed and reports, the saved models (.h5 / .pkl) and
# the report .ini files. The rm globs are not recursive, so files in
# subdirectories such as data/interim/NO2 are left in place.
# __pycache__ directories are removed with `rm -rf` because `find -delete`
# fails on a non-empty directory and would abort the rest of the recipe;
# -prune stops find from descending into a directory that is being removed.
## Delete all compiled Python files and generated data, models and reports
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
	rm -f data/raw/*.csv
	rm -f data/interim/*.csv
	rm -f data/processed/*.csv
	rm -f models/*.h5
	rm -f models/*.pkl
	rm -f models/interim/*.h5
	rm -f reports/*.csv
	rm -f reports/*.ini
#################################################################################
# Self Documenting Commands #
#################################################################################
# Running `make` without a target prints the rule overview produced by `help`.
.DEFAULT_GOAL := help
# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
#
# The sed output is sorted case-insensitively and then formatted by awk:
# * each line is split at `---` into the rule name ($1) and its description ($2)
# * the name is printed in colour (tput setaf 6), left-justified to `indent` columns
# * the description is word-wrapped to the terminal width (tput cols), with
#   continuation lines indented by the same amount
# The result is paged through `more`; on Darwin the flags
# `--no-init --raw-control-chars` are passed to it.
.PHONY: help
help:
@echo "$$(tput bold)Available rules:$$(tput sgr0)"
@echo
@sed -n -e "/^## / { \
h; \
s/.*//; \
:doc" \
-e "H; \
n; \
s/^## //; \
t doc" \
-e "s/:.*//; \
G; \
s/\\n## /---/; \
s/\\n/ /g; \
p; \
}" ${MAKEFILE_LIST} \
| LC_ALL='C' sort --ignore-case \
| awk -F '---' \
-v ncol=$$(tput cols) \
-v indent=19 \
-v col_on="$$(tput setaf 6)" \
-v col_off="$$(tput sgr0)" \
'{ \
printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
n = split($$2, words, " "); \
line_length = ncol - indent; \
for (i = 1; i <= n; i++) { \
line_length -= length(words[i]) + 1; \
if (line_length <= 0) { \
line_length = ncol - indent - length(words[i]) - 1; \
printf "\n%*s ", -indent, " "; \
} \
printf "%s ", words[i]; \
} \
printf "\n"; \
}' \
| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')