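"""Train a slice autoencoder (sae.SliceAutoEncoder) on semantic graphs read from an .mrp file.

GPT-2 supplies the input embeddings (pretrained, or randomly initialized when
FROM_SCRATCH is set); node neighborhoods are encoded either with the default
sparse slice encoder or with a DGL GNN baseline (GCN / RGCN / GAT), and the
resulting features are passed to sae.train, with checkpoints named after the
formalism and ablation configuration.
"""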
import copy
import sys
import json
import itertools
import torch
import transformers
from argparse import ArgumentParser
import sae
import graphmlp
import _dgl
from graphmlp import auto_data_loop
from read_mrp import read_mrp_file, DUMMY_LABEL, INV_LABEL, UNA_LABEL
from util import Dir, get_capacities, str2bool
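# Command-line interface: positional ablation switches plus optional data, GNN-baseline, and UPOS settings.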
argp = ArgumentParser()
argp.add_argument('formalism', type=str, help='Linguistic formalism; must match a training and validation .mrp file in DATA.')
argp.add_argument('epochs', type=int, help='How many epochs to train for.')
# argp.add_argument('train_mode', type=int, help='0: graph, 1: lm, 2: both, 3: combined, '
# '4: graph (combined sanity), 5: lm (combined sanity)')
argp.add_argument('from_scratch', type=str2bool, help='Whether or not to train the Transformer language model (GPT-2) from scratch.')
argp.add_argument('per_l', type=str2bool, help='Whether or not to permute edge labels in the input.')
argp.add_argument('per_a', type=str2bool, help='Whether or not to permute token-to-node anchoring in the input.')
argp.add_argument('keep_una', type=str2bool, help='Whether or not to retain unanalyzable (multi-word) anchors when permuting token anchoring (does nothing if PER_A=0).')
# argp.add_argument('lm_weight', type=float, help='Weight of LM-finetuning-only loss in addition to ensemble-LM loss.')
# argp.add_argument('aux_weight', type=float, help='Weight of MTL auxiliary loss in addition to ensemble-LM loss. (Not implemented)')
argp.add_argument('nslm_no_tokens', type=str2bool, help='Whether or not to remove tokens from linguistic graph inputs.')
argp.add_argument('seed', type=int, help='Seed for random model and data shuffling initialization.')
argp.add_argument('--data', type=str, default='mrp/', help='Main data directory.')
argp.add_argument('--baseline-enc', type=str, choices=['gcn', 'rgcn', 'gat'], default=None, help='Which variant of graph neural net baseline to use, if any.')
argp.add_argument('--upos-file', type=str, default=None, help='Performs comparison-by-combination with UPOS if .mrp file is provided.')
args = argp.parse_args()
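# Optionally load UPOS annotations (JSON) and build a deterministic tag-to-index map over all observed tags.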
upos = None
upos_types = None
if args.upos_file:
    with open(args.upos_file) as f:
        upos = json.load(f)
    upos_types = {t: i for i, t in enumerate(sorted(map(str, set(itertools.chain(*map(set, upos.values()))))))}
lbda = 0
torch.autograd.set_detect_anomaly(True)
transformers.set_seed(args.seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
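# GPT-2 tokenizer and language model; FROM_SCRATCH keeps the architecture but drops the pretrained weights.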
tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
if args.from_scratch:
    gpt2 = transformers.GPT2LMHeadModel(transformers.GPT2Config.from_pretrained('gpt2'))
else:
    gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2')
embedding = gpt2.get_input_embeddings()
embedding_dim = embedding.embedding_dim
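# Read the training graphs for the chosen formalism, optionally permuting edge labels and token anchoring.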
data_dir = f'{args.data}/{args.formalism}.training.mrp'
_d, edge_labels = read_mrp_file(data_dir, permute_labels=args.per_l, permute_anchors=args.per_a, keep_una=args.keep_una,
                                seed=args.seed)
SHARED_IDS = []
d = [x for x in _d if x['id'] not in SHARED_IDS]
edge_labels[DUMMY_LABEL] = len(edge_labels)
edge_labels[INV_LABEL] = len(edge_labels)
edge_labels[UNA_LABEL] = len(edge_labels)
n_edge_labels = len(edge_labels)
print('edge labels', n_edge_labels, edge_labels)
sibling_dir = Dir.l2r
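# Detached copy of the GPT-2 input embeddings used for node/token features; dropped entirely in the no-tokens ablation.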
inp_emb = torch.nn.Embedding.from_pretrained(embedding.weight.detach().cpu())
emb_param = torch.nn.Parameter(inp_emb.weight)
if args.nslm_no_tokens:
    inp_emb = None
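# Neighborhood capacities: how many parents, children, siblings, aunts, grandparents,
# co-parents, and unanalyzable anchors are indexed per node when building slice features.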
max_parents = 1
index_immediates = True
max_children = 1
max_coparents = 0
max_siblings = 1
max_aunts = 1
max_grandparents = 0
max_una = 0
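# Rough closed-form size of the concatenated slice feature vector (label one-hots per indexed
# relative, plus embedding dims when tokens are used); the authoritative value is recomputed
# from get_capacities below.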
parent_feat_dim = (max_siblings + 1 + max_aunts + 1 + max_grandparents + 1 + 1) * \
                  (max_parents + 1) * len(edge_labels) + \
                  ((max_siblings + 1 + max_aunts + 1 + max_grandparents + 1 + 1) * \
                   (max_parents + 1) + (max_una + 1)) * \
                  embedding_dim * int(inp_emb is not None)
child_feat_dim = (max_coparents + 1 + 1) * (max_children + 1) * len(edge_labels) + \
                 (max_coparents + 1 + 1) * (max_children + 1) * \
                 embedding_dim * int(inp_emb is not None)
feat_dim = parent_feat_dim + child_feat_dim
print('parent_feat_dim', parent_feat_dim)
print('child_feat_dim', child_feat_dim)
print('feat_dim', feat_dim)
capacity_sizes, capacity_types, capacity_rels = get_capacities(len(edge_labels), embedding_dim,
                                                               max_parents=max_parents,
                                                               max_siblings=max_siblings,
                                                               max_grandparents=max_grandparents,
                                                               max_aunts=max_aunts,
                                                               max_children=max_children,
                                                               max_coparents=max_coparents,
                                                               max_una=max_una,
                                                               index_immediates=index_immediates,
                                                               index_tokens=inp_emb is not None,
                                                               index_pos=len(upos_types) if upos is not None else 0)
feat_dim = sum(capacity_sizes)
print('feat_dim', feat_dim)
kwargs = dict(embedding_dim=embedding_dim, edge_labels=edge_labels,
              rels=['parent_', 'sibling_s', 'grandparent_s', 'aunt_s', 'child_', 'coparent_s'],
              max_parents=max_parents, index_immediates=index_immediates,
              max_children=max_children, max_coparents=max_coparents,
              max_siblings=max_siblings, max_aunts=max_aunts,
              max_grandparents=max_grandparents)
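# Choose the graph encoder: a DGL-based GNN baseline (GCN, RGCN, or GAT) or the default sparse slice encoder.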
if args.baseline_enc == 'gcn':
    encoder = _dgl.GNN(_dgl.GraphConv, **kwargs)
    feat_dim = encoder.out_dim
elif args.baseline_enc == 'rgcn':
    encoder = _dgl.GNN(_dgl.RelGraphConv, **kwargs)
    feat_dim = encoder.out_dim
elif args.baseline_enc == 'gat':
    encoder = _dgl.GNN(_dgl.GATConv, **kwargs)
    feat_dim = encoder.out_dim
else:
    encoder = graphmlp.SparseSliceEncoder(**kwargs)  # TODO: add an out_dim to SparseSliceEncoder to unify this
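# Assemble the autoencoder around the chosen encoder and a copy of the input embeddings.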
model = sae.SliceAutoEncoder(feat_dim, 1024, [768], capacity_sizes, capacity_types, encoder, copy.deepcopy(inp_emb),
                             dropout=0.2)
if device == 'cuda':
    model.cuda()
model.eval()
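# Hold out the last 10% of the training graphs as a dev set and build streaming data loops for both splits.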
batch_size = 8
train_dev_split = int(len(d) * 0.9)
dev_d = d[train_dev_split:]
d = d[:train_dev_split]
data = auto_data_loop(graphmlp.raw_data_loop, d, edge_labels, tokenizer, encoder,
                      upos=upos, upos_types=upos_types,
                      encode_incremental=0,  # 0: run GNN baselines once per sentence (fastest); set >0 if memory is tight (smaller = slower, less memory)
                      device=device,
                      embedding=copy.deepcopy(inp_emb),
                      batch_size=batch_size, write_cache=False,
                      max_una=max_una,
                      sibling_dir=sibling_dir)
dev_data = auto_data_loop(graphmlp.raw_data_loop, dev_d, edge_labels, tokenizer, encoder,
                          upos=upos, upos_types=upos_types,
                          encode_incremental=0,  # 0 to speed up training: GNN baselines are only run once per sentence
                          device=device,
                          embedding=inp_emb,
                          batch_size=1, write_cache=False,
                          return_first_idxs=True,
                          max_una=max_una,
                          sibling_dir=sibling_dir)
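# Checkpoint name encodes the formalism, epoch budget, ablation flags, seed, and baseline encoder;
# resume from it if a compatible checkpoint already exists.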
checkpoint_name = f'{args.formalism}_sae_model-{args.epochs}-{int(args.from_scratch)}{int(args.per_l)}{int(args.per_a)}{int(args.keep_una)}-{int(args.nslm_no_tokens)}-{args.seed}{args.baseline_enc or ""}.pt'
loaded = None
try:
    loaded = torch.load(checkpoint_name, map_location='cpu')
except Exception:
    print('model not found.', file=sys.stderr)
if loaded is not None:
    try:
        model.load_state_dict(loaded)
    except Exception as e:
        print(f"couldn't load model: {e}", file=sys.stderr)
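# Train (or continue training) the autoencoder, checkpointing under checkpoint_name.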
sae.train(model, data, dev_data=dev_data, epochs=args.epochs, n_data=((len(data) // batch_size) + 1), randomize=True,
          seed=args.seed, checkpoint_name=checkpoint_name, lr=1e-4)
model.eval()