ibpbcl.py
import os
from ibpbnn import IBP_BNN
import copy as cpy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import torch.distributions as tod
import numpy as np
import math
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib
# matplotlib.use('Agg')
import gzip
import pickle
torch.manual_seed(7)
np.random.seed(10)
class IBP_BCL:
    def __init__(self, hidden_size, alpha, no_epochs, data_gen, coreset_method, coreset_size=0, single_head=True, grow=False):
        '''
        hidden_size    : list of network hidden layer sizes
        alpha          : IBP prior concentration parameter(s)
        no_epochs      : number of training epochs per task
        data_gen       : data generator for the sequence of tasks
        coreset_method : "kcen" for k-center selection, anything else for random selection
        coreset_size   : size of the coreset to be used (0 means no coreset)
        single_head    : use a single output head for all tasks, or a separate head per task
        grow           : whether the network is allowed to grow as new tasks arrive
        '''
        ## Initializing hyperparameters for the model.
self.hidden_size = hidden_size
self.alpha = alpha
self.beta = [1.0 for i in range(len(hidden_size))]
self.no_epochs = no_epochs
self.data_gen = data_gen
if(coreset_method != "kcen"):
self.coreset_method = self.rand_from_batch
else:
self.coreset_method = self.k_center
self.coreset_size = coreset_size
self.single_head = single_head
self.grow = grow
self.cuda = torch.cuda.is_available()
def rand_from_batch(self, x_coreset, y_coreset, x_train, y_train, coreset_size):
""" Random coreset selection """
# Randomly select from (x_train, y_train) and add to current coreset (x_coreset, y_coreset)
idx = np.random.choice(x_train.shape[0], coreset_size, False)
x_coreset.append(x_train[idx,:])
y_coreset.append(y_train[idx,:])
x_train = np.delete(x_train, idx, axis=0)
y_train = np.delete(y_train, idx, axis=0)
return x_coreset, y_coreset, x_train, y_train
def k_center(self, x_coreset, y_coreset, x_train, y_train, coreset_size):
""" K-center coreset selection """
# Select K centers from (x_train, y_train) and add to current coreset (x_coreset, y_coreset)
dists = np.full(x_train.shape[0], np.inf)
current_id = 0
dists = self.update_distance(dists, x_train, current_id)
idx = [current_id]
for i in range(1, coreset_size):
current_id = np.argmax(dists)
dists = self.update_distance(dists, x_train, current_id)
idx.append(current_id)
x_coreset.append(x_train[idx,:])
y_coreset.append(y_train[idx,:])
x_train = np.delete(x_train, idx, axis=0)
y_train = np.delete(y_train, idx, axis=0)
return x_coreset, y_coreset, x_train, y_train
def update_distance(self, dists, x_train, current_id):
for i in range(x_train.shape[0]):
current_dist = np.linalg.norm(x_train[i,:]-x_train[current_id,:])
dists[i] = np.minimum(current_dist, dists[i])
return dists
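
    # Greedy k-center selection in brief: start from point 0, repeatedly pick the
    # point farthest from the already selected centres, and after each pick refresh
    # every point's distance to its nearest centre (update_distance).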
def merge_coresets(self, x_coresets, y_coresets):
        ## Merges the per-task coresets into a single training set
merged_x, merged_y = x_coresets[0], y_coresets[0]
for i in range(1, len(x_coresets)):
merged_x = np.vstack((merged_x, x_coresets[i]))
merged_y = np.vstack((merged_y, y_coresets[i]))
return merged_x, merged_y
def logit(self, x):
        eps = 10e-8  # small constant keeps the log arguments strictly positive
return (np.log(x+eps) - np.log(1-x+eps))
def get_soft_logit(self, masks, task_id):
var = []
for i in range(len(masks)):
var.append(self.logit(masks[i][task_id]*0.98 + 0.01))
return var
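
    # Example: a binary mask entry of 1.0 is squashed to 0.99 and an entry of 0.0
    # to 0.01 before taking the logit, so the values later passed to IBP_BNN as
    # prev_pber stay finite.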
def get_scores(self, model, x_testsets, y_testsets, x_coresets, y_coresets, hidden_size,
no_epochs, single_head, batch_size=None, kl_mask = None):
## Retrieving the current model parameters
mf_model = model
mf_weights, mf_variances = model.get_weights()
prev_masks, alpha, beta = mf_model.get_IBP()
acc = []
        ## If the model is single-headed or uses coresets, testing has to be handled accordingly.
if single_head:# If model is single headed.
if len(x_coresets) > 0:# Model has non zero coreset size
del mf_model
torch.cuda.empty_cache()
x_train, y_train = self.merge_coresets(x_coresets, y_coresets)
                prev_pber = self.get_soft_logit(prev_masks, 0)  # the merged coreset is fine-tuned as task 0 below
bsize = x_train.shape[0] if (batch_size is None) else batch_size
final_model = IBP_BNN(x_train.shape[1], hidden_size, y_train.shape[1], x_train.shape[0], self.max_tasks,
prev_means=mf_weights, prev_log_variances=mf_variances,
prev_masks = prev_masks, alpha=alpha, beta = beta, prev_pber = prev_pber,
kl_mask = kl_mask, single_head=single_head)
final_model.ukm = 1
final_model.batch_train(x_train, y_train, 0, 1, bsize, 1)
else:# Model does not have coreset
final_model = model
## Testing for all previously learned tasks
for i in range(len(x_testsets)):
if not single_head:# If model is multi headed.
if len(x_coresets) > 0:
try:
del mf_model
except:
pass
torch.cuda.empty_cache()
x_train, y_train = x_coresets[i], y_coresets[i]# coresets per task
prev_pber = self.get_soft_logit(prev_masks,i)
bsize = x_train.shape[0] if (batch_size is None) else batch_size
final_model = IBP_BNN(x_train.shape[1], hidden_size, y_train.shape[1], x_train.shape[0], self.max_tasks,
prev_means=mf_weights, prev_log_variances=mf_variances,
prev_masks = prev_masks, alpha=alpha, beta = beta, prev_pber = prev_pber,
kl_mask = kl_mask, learning_rate = 0.0001, single_head=single_head)
final_model.ukm = 1
final_model.batch_train(x_train, y_train, i, 1, bsize, 1, init_temp = final_model.min_temp)
else:
final_model = model
x_test, y_test = x_testsets[i], y_testsets[i]
pred = final_model.prediction_prob(x_test, i)
pred_mean = np.mean(pred, axis=1)
pred_y = np.argmax(pred_mean, axis=1)
y = np.argmax(y_test, axis=1)
            # Calculating the accuracy of the model on the given test/validation data.
cur_acc = len(np.where((pred_y - y) == 0)[0]) * 1.0 / y.shape[0]
acc.append(cur_acc)
return acc
def concatenate_results(self, score, all_score):
        ## Appends the current accuracies on all tasks to the previous results, forming a matrix.
if all_score.size == 0:
all_score = np.reshape(score, (1,-1))
else:
new_arr = np.empty((all_score.shape[0], all_score.shape[1]+1))
new_arr[:] = np.nan# Puts nan in place of empty values (tasks that previous model was not trained on)
new_arr[:,:-1] = all_score
all_score = np.vstack((new_arr, score))
return all_score
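
    # Example of how the accuracy matrix grows (rows = model after each task,
    # columns = evaluated task, NaN = task an earlier model never saw), e.g.:
    #   after task 0: [[0.98]]
    #   after task 1: [[0.98,  nan],
    #                  [0.97, 0.95]]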
def batch_train(self, batch_size=None):
'''
        batch_size : batch size used for gradient updates (None uses the full training set)
        '''
        ## Initializing coresets and dimensions.
in_dim, out_dim = self.data_gen.get_dims()
x_coresets, y_coresets = [], []
x_testsets, y_testsets = [], []
x_trainset, y_trainset = [], []
all_acc = np.array([])
self.max_tasks = self.data_gen.max_iter
fig1, ax1 = plt.subplots(1,self.max_tasks, figsize = [10,5])
## Training the model sequentially.
for task_id in range(self.max_tasks):
## Loading training and test data for current task
x_train, y_train, x_test, y_test = self.data_gen.next_task()
x_testsets.append(x_test)
y_testsets.append(y_test)
## Initializing the batch size for training
bsize = x_train.shape[0] if (batch_size is None) else batch_size
            ## If this is the first task, initialize a few variables.
if task_id == 0:
prev_masks = None
prev_pber = None
kl_mask = None
mf_weights = None
mf_variances = None
            ## Select a coreset if the coreset size is non-zero
if self.coreset_size > 0:
x_coresets,y_coresets,x_train,y_train = self.coreset_method(x_coresets,y_coresets,x_train,y_train,self.coreset_size)
## Training the network
mf_model = IBP_BNN(in_dim, self.hidden_size, out_dim, x_train.shape[0], self.max_tasks,
prev_means=mf_weights, prev_log_variances=mf_variances,
prev_masks = prev_masks, alpha=self.alpha, beta = self.beta, prev_pber = prev_pber,
kl_mask = kl_mask, single_head=self.single_head)
mf_model.grow_net = self.grow
if(self.cuda):
mf_model = mf_model.cuda()
if torch.cuda.device_count() > 1:
mf_model = nn.DataParallel(mf_model) #enabling data parallelism
mf_model.batch_train(x_train, y_train, task_id, self.no_epochs, bsize,max(self.no_epochs//5,1))
mf_weights, mf_variances = mf_model.get_weights()
prev_masks, self.alpha, self.beta = mf_model.get_IBP()
self.hidden_size = deepcopy(mf_model.size[1:-1])
            ## Figure of the masks learned for all tasks seen so far.
fig, ax = plt.subplots(1,task_id+1, figsize = [10,5])
for i,m in enumerate(prev_masks[0][:task_id+1]):
if(task_id == 0):
ax.imshow(m, vmin = 0, vmax = 1)
else:
ax[i].imshow(m,vmin=0, vmax=1)
fig.savefig("all_masks.png")
            ## Calculating the union of all task masks and visualizing the layer-wise network sparsity
sparsity = []
kl_mask = []
M = len(mf_variances[0])
for j in range(M):
c_layer_masks = prev_masks[j]
canvas = np.zeros_like(c_layer_masks[task_id])
for t_id in range(task_id+1):
din, dout = c_layer_masks[t_id].shape
canvas[:din,:dout] = canvas[:din,:dout] + c_layer_masks[t_id]
                ## Binarize to obtain the union mask across tasks
                mask = (canvas > 0.01)*1.0
                kl_mask.append(mask)
                ## Fraction of the layer in use (reported below as network sparsity)
                filled = np.mean(mask)
                sparsity.append(filled)
ax1[task_id].imshow(mask,vmin=0, vmax=1)
fig1.savefig("union_mask.png")
print("Network sparsity : ", sparsity)
mf_model.grow_net = False
acc = self.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets,
self.hidden_size, self.no_epochs, self.single_head, batch_size, kl_mask)
            os.makedirs("./saves", exist_ok=True)  # ensure the checkpoint directory exists
            torch.save(mf_model.state_dict(), "./saves/model_last_" + str(task_id))
del mf_model
torch.cuda.empty_cache()
            all_acc = self.concatenate_results(acc, all_acc)
            print(all_acc)
            print('*****')
return [all_acc, prev_masks]
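

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The toy generator and the
# hyperparameter values below are assumptions for demonstration, not the
# settings used in the original experiments; the generator merely mimics the
# interface this class relies on: get_dims(), max_iter and next_task().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class ToyDataGenerator:
        """Hypothetical stand-in data generator producing random one-hot data."""
        def __init__(self, n_tasks=2, in_dim=784, out_dim=10, n_train=256, n_test=64):
            self.max_iter = n_tasks
            self.in_dim, self.out_dim = in_dim, out_dim
            self.n_train, self.n_test = n_train, n_test

        def get_dims(self):
            return self.in_dim, self.out_dim

        def next_task(self):
            # One random task per call: inputs in [0, 1], one-hot targets.
            x_train = np.random.rand(self.n_train, self.in_dim)
            y_train = np.eye(self.out_dim)[np.random.randint(self.out_dim, size=self.n_train)]
            x_test = np.random.rand(self.n_test, self.in_dim)
            y_test = np.eye(self.out_dim)[np.random.randint(self.out_dim, size=self.n_test)]
            return x_train, y_train, x_test, y_test

    model = IBP_BCL(hidden_size=[100], alpha=[4.0], no_epochs=5,
                    data_gen=ToyDataGenerator(), coreset_method="rand",
                    coreset_size=40, single_head=True)
    all_acc, masks = model.batch_train(batch_size=128)
    print("Final accuracy matrix:\n", all_acc)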