%% mRVM1
% authors: Ioannis Psorakis psorakis@gmail.com, Theodoros Damoulas
% theo@dcs.gla.ac.uk
% Copyright NCR, 2009
%
% Use of this code is subject to the Terms and Conditions of the Research License Agreement,
% agreed to when this code was downloaded, and a copy of which is available at
% http://www.dcs.gla.ac.uk/inference/pMKL/Terms_and_Conditions.html.
%
% INPUTS:
% =======
% USE '-i' as input_flag if you want to provide the inputs interactively
% through the console
% OR
% use '-p' as input_flag if you want to provide the inputs as function
% parameters (see function declaration above)
%
% In case you use '-p' please provide the following:
% X_collection: cell array of length Sources, containing
% { [N x D1] ; [N x D2] ; ...} (N samples, Di features), the training set
% from multiple feature spaces.
% t: of size C x N (C classes, N samples), the training labels
% standardize_flag: boolean cell array {Sources x 1} that turns ON/OFF data standardization for each source
% convergence_used: value in [1, 2 or 3], the training termination criterion (see
% conv.1 and conv.2 of the theoretical background; 3 stops after a fixed
% number of iterations)
% Nmax: maximum number of iterations (leave empty to default to the number
% of training samples)
% kernel_type: cell array {Sources x 1} of strings, each either 'gaussian',
% 'polynomial' or 'linear'
% kernel_param: cell array {Sources x 1} of kernel parameters (for a linear
% kernel any value will do)
% plot_flag: [1 or 0] plots the number of relevant vectors during training
% dataset_name: auxiliary label
% In case you use '-i' please note:
% each dataset file must contain the necessary variables, which must be in the correct format. Those are:
%
% 1 the class labels, say "t". This variable must be a C x N dimensional array
% where C is the number of classes and N is the size of the data. Say we have a
% problem with 100 samples and 3 classes. The 5th sample would belong to the 2nd
% class if t(:,5)' = [0 1 0].
% For datasets which have independent training and test sets, there should be two of these
% variables, e.g. tTrain and tTest.
%
% 2 the data, say "X". This variable should be an N x D array where N is the number of samples
% and D is the number of features. For datasets which have independent training
% and test sets, there should be two such variables, e.g. Xtrain and Xtest. Also, for multi-kernel problems
% there should be one such variable for each feature space.
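%
% For example, a C x N one-hot label matrix can be built from a 1 x N vector
% of integer class labels (a small sketch with hypothetical variable names):
%
%   labels = [1 2 3 2 1];                % 1 x N integer labels in 1..C
%   C = max(labels); N = length(labels);
%   t = zeros(C,N);                      % C x N one-hot matrix
%   t(sub2ind([C N],labels,1:N)) = 1;    % here t(:,5)' = [1 0 0]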
%
% OUTPUTS:
% =======
% OUTPUT is an object that has a range of properties:
% model_used: the name of the algorithm (e.g. mRVM1);
% dataset_name: the name of the dataset;
% N_total: the total number of samples in the original dataset;
% N_prototypical: the number of relevance vectors extracted by the algorithm;
%
% X_prototypical: the relevance vectors (or prototypical samples) extracted by the algorithm;
%
% X_prototypical_standardized: same as above, but standardized;
% K_sparse: the sparse training kernel, that uses only the relevance vectors;
% W_sparse: the sparse regressors matrix, that uses only the relevance vectors;
%
% active_sample_original_indices: the original indices of the relevance vectors in the dataset;
%
% sources: number of kernels;
% b: kernel mixing coefficients;
%
% kernel_type: kernel(s) used;
% kernel_param: kernel parameter(s) used;
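%
% EXAMPLE:
% ========
% a minimal '-p' call, assuming two feature spaces X1 (N x D1) and X2
% (N x D2) and one-hot labels t (C x N); the variable names below are
% hypothetical:
%
%   X_collection = {X1 ; X2};
%   standardize_flag = {true ; true};
%   kernel_type = {'gaussian' ; 'gaussian'};
%   kernel_param = {1 ; 1};
%   OUTPUT = train_mRVM1_MKL('-p',X_collection,t,standardize_flag,...
%       2,[],kernel_type,kernel_param,0,'demo_run');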
function OUTPUT = train_mRVM1_MKL(input_flag,X_collection,t,standardize_flag,convergence_used,Nmax,kernel_type,kernel_param,plot_flag,dataset_name)
disp('---------------- mRVM1 ----------------')
model_used = 'mRVM1';
if ~exist('input_flag','var')
input_flag = '-i';
end
if strcmp(input_flag,'-i')
%% INTERFACE
% provide the dataset file name
dataset_name = input('please enter the file name of the dataset > ','s');
% attempt to open the file name, if not, stop
try
load(dataset_name);
catch ME
ME.stack;
error('No dataset exists with that name, exiting...')
end
Sources = input('please enter number of kernels > ');
X_collection=cell(Sources,1);
kernel_type = cell(Sources,1);
kernel_param = cell(Sources,1);
standardize_flag = cell(Sources,1);
i=1;
while i<=Sources
%% LOAD DATA
try
fprintf(1,'\n Please provide the variable name of train source %d',i);
X_collection{i} = input(' > ');
standardize_flag{i} = logical(input('standardize_data? (1/0) > '));
fprintf(1,'\n\tplease provide the kernel type of source %d (gaussian,polynomial,linear)',i);
kernel_type{i} = input(' > ','s');
if ~strcmp(kernel_type{i},'linear')
fprintf(1,'\n\tplease provide the kernel parameter of source %d',i);
kernel_param{i} = str2double(input(' > ','s'));
else
kernel_param{i} = '';
end
i=i+1;
catch ME3
disp('incorrect input, please enter again')
end
end
t = input('please provide the train labels variable name > ');
%%%
fprintf('Please select convergence criterion:\n 1. trivial change in hyperpriors \n 2. trivial change in hyperpriors and minimum number of iterations N \n 3. Maximum iterations\n');
convergence_used = input('choice (1, 2 or 3; leave empty to default to 2) > ');
if isempty(convergence_used)
convergence_used = 2;
Nmax = size(X_collection{1},1);
elseif convergence_used==1
Nmax = -1;
elseif (convergence_used == 2) || (convergence_used == 3)
Nmax = input('Enter maximum number of iterations (leave empty for Ntrain) > ');
if isempty(Nmax)
Nmax = size(X_collection{1},1);
end
end
plot_flag = input('plot relevant vectors progression? (1/0) > ');
end
Sources = length(X_collection);
%% INITIALIZATIONS
Xtrain_collection = cell(Sources,1);
% Xmean and Xstd stay empty for sources that are not standardized
Xmean = cell(Sources,1);
Xstd = cell(Sources,1);
Xtrain = [];
for s=1:Sources
if standardize_flag{s}
[Xtrain_collection{s}, Xmean{s}, Xstd{s}] = standardize_data(X_collection{s});
else
Xtrain_collection{s} = X_collection{s};
end
Xtrain = [Xtrain Xtrain_collection{s}];
end
N = size(Xtrain,1);
C = size(t,1);
if isempty(Nmax)
Nmax = size(Xtrain,1);
end
% initialize the auxiliary variables Y to follow the target labels of the
% training set: the entry of each sample's true class gets an extra
% positive boost so that it tends to dominate the other classes
Y = 10*rand(C,N).*t + rand(C,N);
% set all scales to infinity
A = inf*ones(N,C);
% set all regressor values to zero
W = zeros(N,C);
if plot_flag
number_of_RVs = zeros(N+1,1);
end
% mRVM1 starts with an empty model
active_samples=[];
%% PERFORM KERNEL SETUP
disp('building composite kernel...')
% BUILD COMPOSITE KERNEL
Kernels = zeros(N,N,Sources);
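% start from equal mixing weights, beta_s = 1/Sources; the MKL step inside
% the training loop re-estimates them at every iteration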
beta = 1/Sources * ones(1,Sources);
Ktrain = zeros(N,N);
for i=1:Sources
Kernels(:,:,i) = ...
build_standarized_kernel(Xtrain_collection{i},...
Xtrain_collection{i},kernel_type{i},...
kernel_param{i});
Ktrain = Ktrain + beta(i) * Kernels(:,:,i);
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% SELECT FIRST SAMPLE
% here we follow an informative methodology for the selection of the first
% sample to include in the model. We select the sample i for which k_i has
% the largest projection onto the auxiliary variables Y, normalized by
% ||k_i||^2 and the number of classes C.
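% In code: Initial_CON(i) = ||k_i*Y'|| / (C*||k_i||^2), where k_i is the
% i-th row of the composite training kernel Ktrain.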
Initial_CON = sqrt(sum((Ktrain*Y').^2,2)) ./ (C*sum(Ktrain.^2,2));
current_sample_index = find(Initial_CON == max(Initial_CON),1);
% add this sample to the model.
active_samples = [active_samples;current_sample_index];
%% BUILD KERNEL, UPDATE K*,ki,A
% take the i-th row of the training Kernel for that sample
ki = Ktrain(current_sample_index,:);
% If at any point we have M samples in the model, Kstar is MXN. In our case
% where M=1, Kstar=ki
Kstar = Ktrain(active_samples,:);
% update the scales for the selected sample.
A(current_sample_index,:) = update_a_mRVM1_first_time(ki,Y);
% Astar is M x M for M samples in the model. Initially Astar is 1 x 1, because
% we have only one sample in the model.
Astar = diag(A(active_samples,1));
%% LOOP
% Estimate the quantity KKA_inv which will be used for the calculation of
% the contribution of a sample to the model. It shall be also used for the
% update of the W posteriors.
KKA_inv = chol_inv(Kstar*Kstar' + Astar);
% initialize some iterators and auxiliary variables
mRVM1_iterator=0;
% set all convergence flags to false.
converged_loga_N=false;
converged_loga=false;
disp('Training...')
tic
while true
%% Results of previous iteration
% store some results from the previous iteration.
Aold=diag(Astar);
%% Ki
% calculate the kernel function for the selected sample.
ki = Ktrain(current_sample_index,:);
%% CHECK IF ALREADY IN
% check if the selected sample is already inside the relevant vectors
% collection
is_already_in = ...
logical(sum(1*ismember(active_samples,current_sample_index)));
% get its scale
ai = A(current_sample_index,:);
%% CALCULATE Sm,Qcm
% calculate the sparsity and quality factors (See theoretical
% background)
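% For the multi-class model these are
%   Sm  = ki*ki' - ki*Kstar'*inv(Kstar*Kstar' + Astar)*Kstar*ki'
%   Qcm = ki*Y'  - ki*Kstar'*inv(Kstar*Kstar' + Astar)*Kstar*Y'
% where KKA_inv computed above plays the role of inv(Kstar*Kstar' + Astar).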
if mRVM1_iterator==0
Sm = ki*ki' - ki*Kstar'*KKA_inv*Kstar*ki';
Qcm = ki*Y' - ki*Kstar'*KKA_inv*Kstar*Y';
%% UPDATE s,q based on sample existence
if ~is_already_in
si = Sm;
qci = Qcm;
else
si = (ai(1)*Sm)/(ai(1)-Sm);
qci = (ai(1)*Qcm)./(ai(1)-Sm);
end
%% calculate sum of qci^2; the sample contributes iff sum_c qci^2 > C*si
qsum2=sum(qci.^2);
does_contribute = qsum2 > C*si;
else
Sm = BIG_SM(current_sample_index);
Qcm = BIG_QM(current_sample_index,:);
if ~is_already_in
si = Sm;
qci = Qcm;
else
si = (ai(1)*Sm)/(ai(1)-Sm);
qci = (ai(1)*Qcm)./(ai(1)-Sm);
end
qsum2=sum(qci.^2);
does_contribute = qsum2 > C*si;
end
%% CHECK CONDITIONS
% based on the contribution value of the current sample and its
% existence or not in the relevant vector collection, for each case we
% do the following:
if does_contribute && is_already_in
% if the sample does contribute to the model solution and it is
% already in the model keep it and update its scales a
A(current_sample_index,:) = update_a_mRVM1(C,si,qci);
% update the matrices
Astar = diag(A(active_samples,1));
KKA_inv = chol_inv(Kstar*Kstar' + Astar);
% there has been a change in the model (due to the scale update), so
% we need to update the posteriors.
update_posterior_flag=true;
elseif does_contribute && ~is_already_in
% if the sample can contribute to the model but is not inside
% it, we include it.
A(current_sample_index,:) = update_a_mRVM1(C,si,qci);
active_samples = [active_samples current_sample_index];
active_samples = sort(active_samples);
% update matrices
Astar = diag(A(active_samples,1));
Kstar = Ktrain(active_samples,:);
KKA_inv = chol_inv(Kstar*Kstar' + Astar);
% as the model changed due to the inclusion of a sample, we need to
% update the posteriors
update_posterior_flag=true;
elseif ~does_contribute && is_already_in && length(active_samples)~=1
% if the sample does not contribute to the model solution and it is
% in the model, we remove it
A(current_sample_index,:) = inf;
W(current_sample_index,:)=0;
active_samples(active_samples==current_sample_index)=[];
active_samples = sort(active_samples);
% update matrices
Astar = diag(A(active_samples,1));
Kstar = Ktrain(active_samples,:);
KKA_inv = chol_inv(Kstar*Kstar' + Astar);
% as we removed one sample in the model, we have to update our
% posteriors.
update_posterior_flag=true;
else
% the posterior update is not needed if the current sample does not
% contribute to the solution and it is not in the model.
update_posterior_flag=false;
end
non_active=1:1:N;
non_active(active_samples)=[];
%% UPDATE POSTERIORS
% if there was a change in the model, update the regressors W and the
% auxiliary variables Y.
if update_posterior_flag
%% W
W(active_samples,:) = KKA_inv * Kstar * Y';
%% Y
F=Kstar'*W(active_samples,:);
for c=1:C
pos=find(t(c,:)==1);
[Z, Yt(pos,:)] = YTruncate(F(pos',:),c,1000);
end
Y=Yt';
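% YTruncate is an external helper; presumably it returns the posterior
% expectations of the auxiliary variables under the multinomial probit
% truncation (the column of the true class c constrained to be the
% largest), with the third argument assumed to be the number of samples
% used in that estimate.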
%% BIG MATRICES
% these matrices are used to evaluate the contribution of each
% sample of the training set.
BIG_SI = zeros(1,N);
BIG_QCI = zeros(N,C);
% BIG_SI stores the `sparsity factor' (see theoretical background)
% for each sample in the training set; BIG_QCI stores the `quality
% factor' (see theoretical background) for each sample and across
% classes.
% BIG_SM and BIG_QM are auxiliary tables for the calculation of the
% above factors; a different update scheme is followed for included
% and non-included samples of the model (see theoretical
% background).
BIG_SM = diag(Ktrain*Ktrain - Ktrain*Kstar'*KKA_inv*Kstar*Ktrain); % N x 1
BIG_QM = (Ktrain*Y' - Ktrain*Kstar'*KKA_inv*Kstar*Y'); % N x C
BIG_SI(non_active) = BIG_SM(non_active);
BIG_QCI(non_active,:) = BIG_QM(non_active,:);
BIG_SI(active_samples) =...
(A(active_samples,1).*BIG_SM(active_samples))./...
(A(active_samples,1)-BIG_SM(active_samples));
BIG_QCI(active_samples,:) =...
(A(active_samples,:).*BIG_QM(active_samples,:))./...
repmat(A(active_samples,1)-BIG_SM(active_samples),1,C);
% having the sparsity and quality factors for each individual
% sample, we calculate each sample's contribution and store it in
% the BIG_CON matrix; a positive entry means the sample would
% improve the model solution. This matrix is used by the
% informative sample selection scheme (see theoretical background).
BIG_CON = sum(BIG_QCI.^2,2) - C*BIG_SI';
end
%% UPDATE CONVERGENCE
% Calculations for the estimation of the convergence based on the
% change of the scales matrix A.
Acurr=diag(Astar);
if length(Acurr)>length(Aold)
Aold_aux = [Aold;eps*ones(length(Acurr)-length(Aold),1)];
Acurr_aux=Acurr;
elseif length(Aold)>length(Acurr)
Acurr_aux =[Acurr;eps*ones(length(Aold)-length(Acurr),1)];
Aold_aux=Aold;
else
Aold_aux = Aold;
Acurr_aux = Acurr;
end
% LOGA convergence: change of the log-scales between iterations
loga = abs(log(Acurr_aux)-log(Aold_aux));
% convergence requires all log-scale changes to be below 1e-2 and no
% non-active sample with a positive contribution
are_loga_all_small = all(loga < 1e-2);
are_non_active_all_negative = all(BIG_CON(non_active) <= 0);
%Convergence before N
if ~converged_loga
converged_loga = are_loga_all_small &&...
are_non_active_all_negative;
end
%Convergence after N
if ~converged_loga_N
converged_loga_N = are_loga_all_small &&...
are_non_active_all_negative && mRVM1_iterator>N;
end
% stop when converged
if (convergence_used == 1 && converged_loga) || (convergence_used == 2 && converged_loga_N) ||...
(convergence_used == 3 && mRVM1_iterator == Nmax)
break;
end
% stop with a warning if not converged after 10*N iterations
if mRVM1_iterator == 10 * N
warning('not converged - training terminated after maximum iterations')
break;
end
%% PLOTS
if plot_flag
number_of_RVs(mRVM1_iterator+1) = length(active_samples);
plot(number_of_RVs(1:mRVM1_iterator+1));
title('Relevant vectors')
drawnow;
end
%% MKL learning
% estimate the kernel mixing coefficients beta by quadratic programming
M=zeros(N,C,Sources);
for kernel_index=1:Sources
M(:,:,kernel_index) = Kernels(:,:,kernel_index) * W;
end
Omega = zeros(Sources, Sources);
for class_index = 1 : C
Mtemp = squeeze(M(:,class_index,:));
Omega = Omega + Mtemp'*Mtemp;
end
f = zeros(1,Sources);
for kernel_index = 1 : Sources
f(kernel_index)=sum(sum(Y'.*squeeze(M(:,:,kernel_index))));
end
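% the call below solves the constrained QP
%   min_b  (1/2)*b'*Omega*b - f*b   subject to  sum(b) = 1,  b >= 0
% (the all-ones Aeq/beq arguments encode the sum-to-one constraint, while
% the all-zero inequality arguments are inactive placeholders)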
beta = quadprog(Omega, -f, zeros(Sources), zeros(Sources, 1 ),...
ones(Sources), ones(Sources, 1), zeros(Sources,1) );
% Update Composite Kernel----------
Ktrain=zeros(N,N);
for kernel_index=1:Sources
Ktrain=Ktrain+beta(kernel_index)*Kernels(:,:,kernel_index);
end
%% INFORMATIVE SAMPLE SELECTION
% based on the contributions matrix BIG_CON, perform an informative
% selection scheme for the next sample to be evaluated. First, we
% check whether a non-included sample i can contribute to the model
% solution (BIG_CON(i)>0). If more than one sample satisfies that
% criterion, select the one with the maximum contribution. If no such
% sample exists, check whether any sample inside the model has a
% negative contribution; if more than one such sample exists, select
% the one with the most negative contribution.
% perform some intermediate calculations
BIG_CONtmp = BIG_CON;
BIG_CONtmp(active_samples)=-inf;
non_active=1:1:N;
non_active(active_samples)=[];
BIG_CONtmp2 = BIG_CON;
BIG_CONtmp2(non_active)=inf;
% and apply the criteria described above
if ~isempty(BIG_CONtmp(BIG_CONtmp>0))
% ARE THERE ANY NON-ACTIVE WHICH CAN CONTRIBUTE TO THE MODEL?
current_sample_index = ...
find(BIG_CONtmp==max(BIG_CONtmp(BIG_CONtmp>0)),1);
%disp('Inactive which can contribute selected:')
%current_sample_index
elseif sum(1*(BIG_CON(active_samples)<=0))~=0
% ARE THERE ANY ACTIVE WHICH DO NOT CONTRIBUTE?
current_sample_index = ...
find(BIG_CONtmp2==min(BIG_CONtmp2),1);
%disp('Active which did not contribute selected')
%current_sample_index
else
% UPDATE THE A OF ONE RANDOM ACTIVE SAMPLE
current_sample_index = ...
active_samples(ceil(rand*length(active_samples)));
%disp('Random active sample selected') current_sample_index
end
%% INCREMENT
mRVM1_iterator=mRVM1_iterator+1;
end
disp('TRAINING FINALIZED.');
disp('-------------------');
toc
fprintf('Total iterations until convergence: %d\n',mRVM1_iterator)
fprintf('Number of prototypical samples: %d out of %d total.\n',length(active_samples),N)
if ~exist('dataset_name','var')
dataset_name = 'untitled_run';
end
X_prototypical = cell(Sources,1);
for s=1:Sources
X_aux = X_collection{s};
X_prototypical{s} = X_aux(active_samples,:);
end
X_prototypical_standardized = cell(Sources,1);
for s=1:Sources
if standardize_flag{s}
X_aux = Xtrain_collection{s};
X_prototypical_standardized{s} = X_aux(active_samples,:);
else
X_prototypical_standardized{s} = nan;
end
end
OUTPUT = mRVM_train_output(model_used, dataset_name, N, X_prototypical,standardize_flag,X_prototypical_standardized,Xmean,Xstd, Ktrain(active_samples,active_samples),...
W(active_samples,:), active_samples,kernel_type,kernel_param,Sources,beta);
end
%% AUXILIARY FUNCTIONS
% authors: Ioannis Psorakis psorakis@gmail.com, Theodoros Damoulas
% theo@dcs.gla.ac.uk
% Copyright NCR, 2009
%
% Use of this code is subject to the Terms and Conditions of the Research License Agreement,
% agreed to when this code was downloaded, and a copy of which is available at
% http://www.dcs.gla.ac.uk/inference/pMKL/Terms_and_Conditions.html.
function ai = update_a_mRVM1(C,si,qci)
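% scale update (see theoretical background):
%   a_i = C*si^2 / (sum_c qci^2 - C*si)
% floored below to avoid numerically degenerate values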
result = (C*si^2) / (sum(qci.^2)-C*si);
if result<1e-5
result = 1e-6;
end
ai=result;
end
function a_new = update_a_mRVM1_first_time(ki,Y)
C = size(Y,1);
nominator = sum(ki.^2);
kis = repmat(ki,C,1);
denominator = sum(sum(kis.*Y,2).^2) / (C*nominator -1);
a_new = nominator/denominator;
end
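%% EXTERNAL DEPENDENCIES
% This file relies on helpers shipped with the toolbox: standardize_data,
% build_standarized_kernel, chol_inv, YTruncate and mRVM_train_output.
% As an illustration only, a minimal chol_inv could look like the sketch
% below (an assumption -- the shipped version may differ):
%
%   function Ainv = chol_inv(A)
%   % invert a symmetric positive-definite matrix via its Cholesky factor
%   R = chol(A);                 % A = R'*R, with R upper triangular
%   Rinv = R \ eye(size(A,1));   % inv(R)
%   Ainv = Rinv * Rinv';         % inv(A) = inv(R)*inv(R')
%   end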