# Instructions
# ------------
# The first time, run link-data.
# For podcast, to create a 777 pickle with all electrodes (signal and
# label) plus glove and gpt2 embeddings:
# 1. run create-sig-pickle for 777
# 2. run create-pickle for 661
# 3. run generate-embeddings for 661 with glove and gpt2
# 4. run concatenate-embeddings for 661 with glove and gpt2
# 5. run copy-embeddings
# 6. (optionally) run upload-pickle
# For 247:
# 1. run create-pickle for each subject
# 2. run generate-embeddings for glove
# 3. run generate-embeddings for gpt2
# 4. run upload-pickle
# Miscellaneous: \
247 subject IDs: 625, 676, 7170, and 798 \
Podcast subject IDs: 661, 662, 717, 723, 741, 742, 743, 763, and 798 \
777 is the collection of significant electrodes
# Other Notes:
# For generate-embeddings:
# 1. Models with more than 6.7B parameters need a GPU with more than
#    40GB of memory.
# 2. This means we currently cannot run EleutherAI/gpt-neox-20b or
#    facebook/opt-30b through huggingface on the della GPUs.
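#
# Example end-to-end podcast run (a sketch; the command-line overrides
# shown are assumptions, adjust them for your environment):
#   make link-data PRJCT_ID=podcast
#   make create-sig-pickle PRJCT_ID=podcast
#   make create-pickle PRJCT_ID=podcast SID_LIST=661
#   make generate-embeddings PRJCT_ID=podcast SID=661 EMB_TYPE=gpt2
#   make concatenate-embeddings PRJCT_ID=podcast SID=661 EMB_TYPE=gpt2
#   make copy-embeddings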
# Run all commands in one shell
.ONESHELL:
# NOTE: link data from tigressdata before running any scripts
PRJCT_ID := tfs
# {tfs | podcast}
link-data:
ifeq ($(PRJCT_ID), podcast)
$(eval DIR_KEY := podcast-data)
else
$(eval DIR_KEY := conversations-car)
endif
# create directory
mkdir -p data/$(PRJCT_ID)
# delete old symlinks
find data/$(PRJCT_ID)/ -xtype l -delete
# create symlinks from original data store
ln -sf /projects/HASSON/247/data/$(DIR_KEY)/* data/$(PRJCT_ID)/
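# Example (a sketch): populate data/tfs or data/podcast with symlinks:
#   make link-data PRJCT_ID=tfs
#   make link-data PRJCT_ID=podcast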
# settings for targets: create-pickle, create-sig-pickle, upload-pickle
%-pickle: CMD := python
# {echo | python}
%-pickle: PRJCT_ID := tfs
# {tfs | podcast}
%-pickle: SID_LIST = 676
# {625 676 7170 798 | 661 662 717 723 741 742 743 763 798 | 777}
create-pickle:
mkdir -p logs
for sid in $(SID_LIST); do \
$(CMD) scripts/tfspkl_main.py \
--project-id $(PRJCT_ID) \
--subject $$sid; \
done
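# Example (a sketch; CMD=echo previews the commands without running them):
#   make create-pickle CMD=echo SID_LIST="625 676"
#   make create-pickle PRJCT_ID=podcast SID_LIST=661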
# create pickle of significant electrodes (just for podcast)
create-sig-pickle:
mkdir -p logs
$(CMD) scripts/tfspkl_main.py \
--project-id $(PRJCT_ID) \
--sig-elec-file all-electrodes2.csv
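# Example (a sketch; podcast only, assuming all-electrodes2.csv is in place):
#   make create-sig-pickle PRJCT_ID=podcast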
# upload pickles to google cloud bucket
# the bucket uses 247 rather than tfs, so adjust manually as needed
# upload-pickle: pid=247
upload-pickle: pid=podcast
upload-pickle:
for sid in $(SID_LIST); do \
gsutil -m rsync -rd results/$(PRJCT_ID)/$$sid/pickles/ gs://247-podcast-data/$(pid)-pickles/$$sid; \
done
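# Example (a sketch; per the note above, use pid=247 for tfs pickles):
#   make upload-pickle pid=247 PRJCT_ID=tfs SID_LIST="625 676"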
# upload raw data to google cloud bucket
upload-data:
gsutil -m rsync -rd data/tfs/676 gs://247-podcast-data/247-data/676/
gsutil -m rsync -rd data/tfs/625 gs://247-podcast-data/247-data/625/
# download pickles from google cloud bucket
download-247-pickles:
mkdir -p results/625 results/676
gsutil -m rsync -x "^(?!.*625).*" gs://247-podcast-data/247-pickles/ results/625/
gsutil -m rsync -x "^(?!.*676).*" gs://247-podcast-data/247-pickles/ results/676/
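# Note: gsutil's -x flag takes a Python regex of paths to exclude; the
# negative lookaheads above skip every object that does not mention the
# given subject ID.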
## settings for targets: generate-embeddings, concatenate-embeddings
%-embeddings: PRJCT_ID := podcast
# {tfs | podcast}
%-embeddings: SID := 661
# {625 | 676 | 7170 | 798 | 661}
%-embeddings: CONV_IDS = $(shell seq 1 1)
# {54 for 625 | 78 for 676 | 1 for 661 | 24 for 7170 | 15 for 798}
%-embeddings: PKL_IDENTIFIER := full
# {full | trimmed | binned}
%-embeddings: EMB_TYPE := gpt2-xl
# {"gpt2", "gpt2-large", "gpt2-xl", \
"EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B", \
"EleutherAI/gpt-neox-20b", \
"facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", \
"facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-30b", \
"facebook/blenderbot_small-90M"}
%-embeddings: CNXT_LEN := 1024
%-embeddings: LAYER := all
# {'all' for all layers | 'last' for the last layer | (list of) integer(s) >= 1}
# Note: the embeddings file is the same for all podcast subjects, \
so it is generated only once, using subject 661
%-embeddings: JOB_NAME = $(subst /,-,$(EMB_TYPE))
%-embeddings: CMD = python
# {echo | python | sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh}
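# Example overrides (a sketch; the conversation count follows the
# CONV_IDS comment above):
#   make generate-embeddings PRJCT_ID=tfs SID=625 CONV_IDS="`seq 1 54`" EMB_TYPE=gpt2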
# 38 and 39 failed
# generate-base-for-embeddings: Generates the base dataframe for embedding generation
generate-base-for-embeddings:
python scripts/tfsemb_LMBase.py \
--project-id $(PRJCT_ID) \
--pkl-identifier $(PKL_IDENTIFIER) \
--subject $(SID) \
--embedding-type $(EMB_TYPE);
# generates embeddings (for each conversation separately)
generate-embeddings: generate-base-for-embeddings
mkdir -p logs
for cnxt_len in $(CNXT_LEN); do \
for conv_id in $(CONV_IDS); do \
$(CMD) scripts/tfsemb_main.py \
--project-id $(PRJCT_ID) \
--pkl-identifier $(PKL_IDENTIFIER) \
--subject $(SID) \
--conversation-id $$conv_id \
--embedding-type $(EMB_TYPE) \
--layer-idx $(LAYER) \
--context-length $$cnxt_len; \
done; \
done;
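# Example (a sketch): preview the per-conversation commands, then run:
#   make generate-embeddings CMD=echo
#   make generate-embeddings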
# concatenate embeddings from all conversations
concatenate-embeddings:
for cnxt_len in $(CNXT_LEN); do \
python scripts/tfsemb_concat.py \
--project-id $(PRJCT_ID) \
--pkl-identifier $(PKL_IDENTIFIER) \
--subject $(SID) \
--embedding-type $(EMB_TYPE) \
--context-length $$cnxt_len; \
done;
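# Example (a sketch; settings must match the generate-embeddings run):
#   make concatenate-embeddings SID=661 EMB_TYPE=gpt2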
# Podcast: copy embeddings to other subjects as well
copy-embeddings:
fn=results/podcast/661/pickles/embeddings
for sid in 662 717 723 741 742 763 798 777; do \
cp -rpf $$fn $$(echo $$fn | sed "s/661/$$sid/g"); \
done
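# Example (a sketch; run after concatenate-embeddings for subject 661):
#   make copy-embeddings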
# Download huggingface models to cache (before generating embeddings)
# This target needs to be run on the head node
cache-models: MODEL := causal
# {causal | seq2seq | mlm | or any model name specified in EMB_TYPE comments}
cache-models:
python -c "from scripts import tfsemb_download; tfsemb_download.download_tokenizers_and_models(\"$(MODEL)\")"
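# Example (a sketch; run on the head node, per the note above):
#   make cache-models MODEL=causal
#   make cache-models MODEL=facebook/opt-6.7b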