# Makefile (forked from hassonlab/247-pickling)
# Miscellaneous: \
247 subject IDs: 625 and 676 \
Podcast subject IDs: 661 662 717 723 741 742 743 763 798 \
777: ID used for the collection of significant electrodes
# NOTE: link data from tigressdata before running any scripts \
(recommended before running any other target)
PRJCT_ID := podcast
# {tfs | podcast}
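# Usage sketch (assumes the standard make command-line override mechanism):
#   make link-data PRJCT_ID=tfs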
link-data:
ifeq ($(PRJCT_ID), podcast)
	$(eval DIR_KEY := podcast-data)
else
	$(eval DIR_KEY := conversations-car)
endif
	# create directory
	mkdir -p data/$(PRJCT_ID)
	# delete bad symlinks
	find data/$(PRJCT_ID)/ -xtype l -delete
	# create symlinks from original data store
	ln -sf /projects/HASSON/247/data/$(DIR_KEY)/* data/$(PRJCT_ID)/
ifeq ($(PRJCT_ID), podcast)
	cp /projects/HASSON/247/data/podcast/podcast-transcription.txt data/$(PRJCT_ID)/
	cp /projects/HASSON/247/data/podcast/podcast-datum-cloze.csv data/$(PRJCT_ID)/
endif
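# Optional sanity check (illustrative, not part of the original pipeline):
# confirm the new symlinks resolve, e.g.
#   ls -L data/podcast/ | head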
# settings for targets: create-pickle, create-sig-pickle, upload-pickle
%-pickle: CMD := python # {echo | python}
%-pickle: PRJCT_ID := podcast # {tfs | podcast}
%-pickle: SID_LIST=743 # {625 676 | 661 662 717 723 741 742 743 763 798 | 777}
%-pickle: MEL := 500 # Setting a large number will extract all common \
electrodes across all conversations
%-pickle: MINF := 0
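# Usage sketch (illustrative): dry-run the generated commands with CMD=echo,
# or pickle both 247 subjects in one go, e.g.
#   make create-pickle CMD=echo
#   make create-pickle PRJCT_ID=tfs SID_LIST="625 676"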
create-pickle:
	mkdir -p logs
	for sid in $(SID_LIST); do \
		$(CMD) code/tfspkl_main.py \
			--project-id $(PRJCT_ID) \
			--subject $$sid \
			--max-electrodes $(MEL) \
			--vocab-min-freq $(MINF); \
	done
# create pickle for significant electrodes
create-sig-pickle:
	mkdir -p logs
	$(CMD) code/tfspkl_main.py \
		--project-id $(PRJCT_ID) \
		--sig-elec-file /scratch/gpfs/hgazula/phase-5000-sig-elec-glove50d-perElec-FDR-01_newVer_1000perm-LH.csv \
		--max-electrodes $(MEL) \
		--vocab-min-freq $(MINF);
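# NOTE: the --sig-elec-file path above points at one user's scratch space;
# point it at your own CSV of significant electrodes before running this target.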
# upload pickles to google cloud bucket
upload-pickle:
	for sid in $(SID_LIST); do \
		gsutil -m cp -r results/$(PRJCT_ID)/$$sid/pickles/*.pkl gs://247-podcast-data/$(PRJCT_ID)_pickles/$$sid; \
	done
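# Assumes gsutil is installed and authenticated with write access to the
# 247-podcast-data bucket, e.g.
#   make upload-pickle SID_LIST="661 662"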
# download pickles from google cloud bucket
download-247-pickles:
	mkdir -p results/625 results/676
	gsutil -m rsync -x "^(?!.*625).*" gs://247-podcast-data/247_pickles/ results/625/
	gsutil -m rsync -x "^(?!.*676).*" gs://247-podcast-data/247_pickles/ results/676/
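# The -x flag takes a Python regex of paths to exclude; the negative
# lookahead "^(?!.*625).*" excludes everything whose path does not contain
# 625, so only subject-625 pickles are synced (likewise for 676).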
# settings for targets: generate-embeddings, concatenate-embeddings
%-embeddings: CMD := sbatch submit.sh # {echo | python | sbatch submit.sh}
%-embeddings: PRJCT_ID := podcast # {tfs | podcast}
%-embeddings: SID := 661 # {625 | 676 | 661}
%-embeddings: CONV_IDS = $(shell seq 1 1) # {54 for 625 | 79 for 676 | 1 for 661}
%-embeddings: PKL_IDENTIFIER := full # {full | trimmed | binned}
%-embeddings: EMB_TYPE := gpt2-xl # {glove50 | bert | gpt2-xl}
%-embeddings: CNXT_LEN := 1024
%-embeddings: HIST := --history
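# Usage sketch (illustrative): run locally for all 79 conversations of
# subject 676, e.g.
#   make generate-embeddings CMD=python PRJCT_ID=tfs SID=676 CONV_IDS="$(seq 1 79)"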
# Note: the embeddings file is the same for all podcast subjects, \
so it is generated only once, using subject 661
# generates embeddings (for each conversation separately)
generate-embeddings:
	mkdir -p logs
	for conv_id in $(CONV_IDS); do \
		$(CMD) code/tfsemb_main.py \
			--project-id $(PRJCT_ID) \
			--pkl-identifier $(PKL_IDENTIFIER) \
			--subject $(SID) \
			--conversation-id $$conv_id \
			--embedding-type $(EMB_TYPE) \
			$(HIST) \
			--context-length $(CNXT_LEN); \
	done
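# CMD defaults to sbatch submit.sh, so this assumes a SLURM cluster with a
# submit.sh wrapper; for a quick local test, override with
#   make generate-embeddings CMD=python CONV_IDS=1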
# concatenate embeddings from all conversations
concatenate-embeddings:
	python code/tfsemb_concat.py \
		--project-id $(PRJCT_ID) \
		--pkl-identifier $(PKL_IDENTIFIER) \
		--subject $(SID) \
		--embedding-type $(EMB_TYPE) \
		$(HIST) \
		--context-length $(CNXT_LEN);
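# Run concatenate-embeddings only after all generate-embeddings jobs have
# finished, since it stitches the per-conversation outputs together.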
# Podcast: copy embeddings to other subjects as well
copy-embeddings:
	for sid in 662 717 723 741 742 763 798 777; do \
		rsync -a /scratch/gpfs/$(shell whoami)/247-pickling/results/podcast/661/ \
			/scratch/gpfs/$(shell whoami)/247-pickling/results/podcast/$$sid/; \
	done
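# The trailing slash on 661/ matters for rsync: it copies the directory's
# contents into each subject folder instead of nesting a 661/ inside it.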
# Sync results with the /projects/HASSON folder
sync-results:
	rsync -ah /scratch/gpfs/$(shell whoami)/247-pickling/results/* /projects/HASSON/247/results_new_infra/pickling/