-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_interaction_variables.m
454 lines (427 loc) · 18.1 KB
/
create_interaction_variables.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
function data_set = create_interaction_variables(data_set,vars,range_nways,separator,max_varname_length)
% create_interaction_variables
%
% for Matlab R13+
% version 1.1 (April 2012)
% (c) Brian Weidenbaum
% website: http://www.BrianWeidenbaum.com/.
%
%
% OUTPUT: your dataset (or a dataset based on your matrix), updated with new, aptly-named *unique* interaction variables,
% ranging from at least 2 to any number the user specifies
%
% INPUTS (** = OPTIONAL)
% input name: (input datatype/s) -- description
%
% data_set: (dataset OR matrix) -- the data you want to alter
%
% **vars: (cell array of chars/numbers, OR vector of numbers, OR 'ALL') --
% default: 'ALL'
% the names of the variables you want to interact; alternatively, the column numbers of the variables you want to interact
% OR, you can just say 'ALL' to include all variables automatically
%
% **range_nways (vector OR 'MAX') --
% default: 2
% the range of numbers of variables to include in the interaction terms generated by this function
% alternatively, just type 'MAX' to use 2 variables to the maximum possible number of vars
%
% **separator (char) --
% default: '_'
% the separator string you want to use to divide the
% variable names that contributed to a new interaction variable. Default
% is '_'. E.g., by default, an interaction between Var1 and Var2 will be
% named 'Var1_Var2'.
%
% **max_varname_length (number) --
% default: 63
% the maximum length of the newly created interaction terms' variable names.
% Any dynamically generated variable names (e.g. 'Var1_Var2') that exceed this number will be excluded from the new dataset.
% You should set max_varname_length according to the database you plan to use with your data--
% e.g., if you only want to use this data in MATLAB, you should set max_varname_length=63 (the maximum length supported by the dataset class),
% but if you plan to export your data to Oracle, you should set it to around 30.
%
%
%
% EXAMPLES
%
% You have a dataset with 3 variables: a, b, and c.
% You want to create interaction terms, up to 3 ways, for all of your variables.
% You type:
% new_dataset = create_interaction_variables(old_dataset,'all','max');
% new_dataset will contain:
% a.*b, named 'a_b'
% a.*c, named 'a_c'
% b.*c, named 'b_c'
% and a.*b.*c, named 'a_b_c',
% plus all original variables.
% It will NOT contain b.*a, c.*a, etc because these are not unique combos.
%
% You have a dataset with 3 variables: a, b, and c.
% You want to create interaction terms, up to 2 ways, only for columns 2 and 3.
% You type:
% new_dataset = create_interaction_variables(old_dataset,[2 3],2);
% new_dataset will contain only b.*c,
% plus all original variables.
%
% You have a dataset with 3 variables: a, b, and c.
% You want to create interaction terms, up to 2 ways, only for 'a' and 'c'.
% You type:
% new_dataset = create_interaction_variables(old_dataset,{'a','c'},2);
% new_dataset will contain only a.*c,
% plus all original variables.
%
% You have a dataset with 3 variables: a, b, and c.
% You want to create interaction terms, ONLY 3 ways, for all vars.
% You type:
% new_dataset = create_interaction_variables(old_dataset,'all',3);
% new_dataset will contain only a .* b .* c,
% plus all original variables.
%
%
%
% CHANGE LOG
% Changes between 1.0 and 1.1:
% -added 'separator' parameter, enabling users to create custom named vars
% -changed 'max_nways' parameter to 'range_nways', giving user more control
% over the types of interactions created
% -enforced maximum max_varname_length as 63
% -reduced min number of arguments to 1, setting defaults for other 4 params
% -minor performance tweaks via vectorizing some inner function loops
%
%
%PHASE ONE: USER INPUT VALIDATION
% Fill in the default for every optional parameter the caller left out.
if nargin<2
    vars='all';
end
if nargin<3
    range_nways=2;
end
if nargin<4
    separator='_';
end
if nargin<5
    max_varname_length = 63; %max length supported by MATLAB dataset class
end
%if data_set param is a regular matrix, convert it to a dataset.
%Otherwise, user will be unable to know which columns correspond to which interactions
if ~isa(data_set,'dataset')
    data_set = matrix2dataset(data_set);
    %this data_set will include default variable names, like 'Var1', 'Var2', etc
end
%a char vars parameter is only allowed to be 'all' (any letter case),
%which selects every variable in the dataset
if ischar(vars)
    if strcmpi(vars,'ALL')
        vars = data_set.Properties.VarNames;
    else
        error('If you are using a char array as your vars parameter, you can only set it as ''all'' or ''ALL''. ');
    end
end
nvars = length(vars);
if nvars<2
    error('You must specify at least 2 variable names/column numbers inside the vars parameter.');
end
%check if each variable name/column number in vars param exists inside the dataset
%also, if necessary, convert column numbers to variable names, for future use with eval strings.
%This runs BEFORE range_nways is validated so that range_nways is clamped
%against the number of variables that actually survive validation.
if iscell(vars)
    if ~ischar(vars{1}) %if the varnames are NOT chars
        vars = columns2names(vars);
    else %vars are chars, so check if each name is member of the dataset
        for i=1:length(vars)
            if ~ismember(vars{i},data_set.Properties.VarNames)
                disp([vars{i} ' is not a valid column and will not be included in the interactions.' ]);
            end
        end
        vars(~ismember(vars,data_set.Properties.VarNames))=[];
    end
else %its for sure a vector of numbers
    vars = columns2names(vars);
end
%from here on nvars is the number of VALID variables
nvars = length(vars);
if nvars<2
    %nothing can be interacted; hand the dataset back untouched
    disp('Fewer than 2 valid variables remain, so no interaction variables were created.');
    return;
end
%validate range_nways
if ischar(range_nways)
    %any char value (documented as 'MAX') means: every size from 2 to nvars
    range_nways = 2:nvars;
elseif isnumeric(range_nways)
    if isempty(range_nways)
        error('The range_nways parameter should contain at least one number.');
    end
    % 2 <= vars per interaction <= number of valid variables.
    % Values < 2 become 2, values > nvars become nvars, and duplicates
    % are then collapsed with unique().
    if any(range_nways(:)<2)
        disp(['At least one number in range_nways is < 2.' ...
            ' Since you must have at least 2 variables per interaction, all numbers < 2 have been replaced with 2.']);
    end
    range_nways(range_nways<2)=2;
    if any(range_nways(:)>nvars)
        disp(['At least one number in range_nways is > ' num2str(nvars)...
            ' (the total number of variables you are trying to create interactions from).' ...
            ' All such numbers will be dropped from range_nways and replaced with a single ' num2str(nvars) '.']);
    end
    range_nways(range_nways>nvars)=nvars;
    range_nways=unique(range_nways);
    %force range_nways to be a row vector so it can drive a for loop
    range_nways = reshape(range_nways,1,[]);
else
    error('range_nways parameter MUST be a vector of numbers, or ''MAX''.');
end
%validate separator
if ~ischar(separator)
    error('Seperator needs to be a char array.');
elseif length(separator)>60
    error('Your separator term is too long.');
elseif ismember('%',separator)
    error('Your separator contains one or more illegal characters.');
end
%validate max_varname_length
if max_varname_length>63
    error('Max varname length exceeds 63, which is the highest number supported by the MATLAB dataset class.');
end
%END INPUT VALIDATION
%PHASE TWO: CREATE ALL THE INTERACTION VARIABLES
%each string is an assignment of the form 'data_set.a_b=data_set.a.*data_set.b;'
%(or '' when the generated name was rejected)
all_evalstrs = get_evalstrs;
for i=1:length(all_evalstrs)
    %a failing assignment (e.g. multiplying chars by numbers) is reported
    %and skipped instead of aborting the whole run
    try
        eval(all_evalstrs{i});
    catch e
        disp(e.message);
    end
end
%PHASE THREE: PROFIT
%
%
% INNER FUNCTIONS
%
%
%nested helper: converts column NUMBERS into the matching variable names
%(a cell array of chars) so they can be spliced into eval() strings
function char_varnames = columns2names(columns)
    %a cell array of numbers is flattened into a plain numeric array first
    if iscell(columns)
        columns = cell2mat(columns);
    end
    total_columns = size(data_set,2);
    %report each column number outside 1..total_columns; every reported
    %column also lowers nvars, which lives in the enclosing function
    for pos=1:length(columns)
        this_column = columns(pos);
        if ((this_column<1) || (this_column>total_columns))
            disp(['Column number ' num2str(this_column) ' is not a valid column and will not be included in the interactions.' ]);
            nvars=nvars-1;
        end
    end
    %remove the out-of-range entries, then look up the surviving names
    out_of_range = columns(:)<1 | columns(:)>total_columns;
    columns(out_of_range)=[];
    char_varnames = data_set.Properties.VarNames(columns);
end %inner function
% nested helper that lists every UNIQUE combination of variable indices for
% each interaction size in range_nways (read from the enclosing function).
%
% OUTPUT
% all_indices: 1 x N cell array; each cell holds one strictly increasing
%              row vector of indices into vars
%
% e.g. with 3 variables and range_nways = [2 3] the result is
% {[1 2],[1 3],[2 3],[1 2 3]}
function all_indices = get_interaction_indices()
    all_indices = {};
    for nways = range_nways
        %a size that cannot be formed from nvars variables has no combos
        if nways<2 || nways>nvars
            continue;
        end
        %nchoosek enumerates only the nvars-choose-nways index sets, so no
        %nvars^nways Cartesian product has to be built and filtered.
        %sortrows pins the rows to lexicographic order (12,13,23,...)
        picks = sortrows(nchoosek(1:nvars,nways));
        %one cell per row, appended as a row cell array
        all_indices = [all_indices, num2cell(picks,2)']; %#ok<AGROW>
    end %loop over each requested number of vars per interaction
end %get_interaction_indices inner fx
% nested helper: turns a vector of indices into vars into the assignment string
% 'data_set.var1_var2_varN=data_set.var1.*data_set.var2.*data_set.varN;'
% e.g. indices [1 2] give 'data_set.var1_var2=data_set.var1.*data_set.var2;'
% Raises an error when the new variable name is longer than max_varname_length.
function str = indices2str(indices)
    ds_prefix = 'data_set.';
    n_chosen = length(indices);
    chosen = reshape(vars(indices),1,n_chosen);
    %interleave name,separator,name,separator,... and flatten to one string
    name_parts = [chosen; repmat({separator},1,n_chosen)];
    new_name = ['' name_parts{:}];
    %cut off the separator that trails the last name
    new_name = new_name(1:end-length(separator));
    %reject names the caller considers too long
    if length(new_name) > max_varname_length
        error(['Name of new variable: ''' new_name ...
            ''' exceeded max varname length of ' num2str(max_varname_length) ', and will not be included in the new dataset.']);
    end
    %right-hand side: 'data_set.var1.*data_set.var2.*' built the same way
    product_parts = [repmat({ds_prefix},1,n_chosen); chosen; repmat({'.*'},1,n_chosen)];
    product = ['' product_parts{:}];
    %swap the trailing '.*' for the statement terminator
    product = [product(1:end-2) ';'];
    str = [ds_prefix new_name '=' product];
end %indices2str inner fx
%returns a column cell array with one eval-ready assignment string per
%index combination; a combination whose string cannot be built gets ''
function all_evalstrs = get_evalstrs
    index_sets = get_interaction_indices;
    n_sets = length(index_sets);
    all_evalstrs = cell(n_sets,1);
    slot = 0;
    while slot < n_sets
        slot = slot + 1;
        %indices2str raises an error (e.g. name too long); report it and
        %leave an empty string in that slot
        try
            all_evalstrs{slot} = indices2str(index_sets{slot});
        catch failure
            all_evalstrs{slot} = '';
            disp(failure.message);
        end
    end
end %get all evalstrs
function A = allcomb(varargin)
% ALLCOMB - All combinations (cartesian product)
%   B = ALLCOMB(A1,A2,A3,...,AN) returns every combination of the elements
%   of A1, A2, ..., AN. B is a P-by-N matrix, where P is the product of
%   the numbers of elements of the N inputs.
%   If any input is empty, a warning is issued and B is 0-by-nargin.
%
%   Example:
%      allcomb([1 3 5],[-3 8],[0 1]) ;
%        1  -3   0
%        1  -3   1
%        1   8   0
%        ...
%        5  -3   1
%        5   8   0
%        5   8   1
%
%   ALLCOMB(A1,..AN,'matlab') makes the FIRST column change fastest,
%   which is more consistent with matlab indexing. Example:
%      allcomb(1:2,3:4,5:6,'matlab') %->
%        1 3 5
%        2 3 5
%        1 4 5
%        ...
%        2 4 6
%
%   See also NCHOOSEK, PERMS, NDGRID
%            and COMBN, KTHCOMBN (Matlab Central FEX)
% for Matlab R13+
% version 2.2 (jan 2012)
% (c) Jos van der Geest
% email: jos@jasen.nl
% History
% 1.1 (feb 2006), removed minor bug when entering empty cell arrays;
%     added option to let the first input run fastest (suggestion by JD)
% 1.2 (jan 2010), using ii as an index on the left-hand for the multiple
%     output by NDGRID. Thanks to Jan Simon, for showing this little trick
% 2.0 (dec 2010). Bruno Luong convinced me that an empty input should
%     return an empty output.
% 2.1 (feb 2011). A cell as input argument caused the check on the last
%     argument (specifying the order) to crash.
% 2.2 (jan 2012). removed a superfluous line of code (ischar(..))
    error(nargchk(1,Inf,nargin)) ;
    % any empty input -> warn and return an empty result
    is_filled = ~cellfun('isempty',varargin) ;
    if ~all(is_filled)
        warning('ALLCOMB:EmptyInput','Empty inputs result in an empty output.') ;
        A = zeros(0,nargin) ;
        return
    end
    n_inputs = sum(is_filled) ;
    last_arg = varargin{end} ;
    if ischar(last_arg) && any(strcmpi(last_arg,{'matlab','john'}))
        % trailing order flag: it is not data, and the first column runs fastest
        n_inputs = n_inputs-1 ;
        order = 1:n_inputs ;
        is_filled(end) = 0 ;
    else
        % feed the arguments backwards so the last one (AN) changes fastest
        order = n_inputs:-1:1 ;
    end
    if n_inputs==0
        % only the order flag was supplied
        A = [] ;
        return
    end
    inputs = varargin(is_filled) ;
    if ~all(cellfun(@(x) strcmp(class(x),'double'),inputs))
        error('All arguments should be arrays of doubles') ;
    end
    if n_inputs==1
        A = inputs{1}(:) ;
    else
        % one grid per input, filled in the chosen order
        grids = cell(1,n_inputs) ;
        [grids{order}] = ndgrid(inputs{order}) ;
        % stack along a new trailing dimension, then flatten: one column per input
        A = reshape(cat(n_inputs+1,grids{:}),[],n_inputs) ;
    end
end %allcomb inner fx
function output = matrix2dataset(matrix,ifx_varnames)
% matrix2dataset
%
% Wraps each column of a matrix in its own dataset variable.
%
% INPUTS
% matrix (N x M matrix): the data to convert; every matrix column becomes
%       exactly one dataset column
% ifx_varnames (OPTIONAL cell vector of chars): the column names, in order;
%       there must be one name per matrix column
%
% OUTPUT:
% a dataset holding all the data from matrix, with its columns named
% according to ifx_varnames when that parameter is given
%
% EXAMPLE:
% You converted an old dataset into a matrix of doubles to run some
% calculations, and now want a dataset again that carries the old names:
%   updated_dataset = matrix2dataset(matrix_from_old_dataset, old_dataset.Properties.VarNames);
%
% When ifx_varnames is omitted, the names are whatever the dataset
% constructor assigns by default (Var1, Var2,..VarN).
%
    n_columns = size(matrix,2);
    names_given = (nargin==2);
    if names_given && length(ifx_varnames)~=n_columns
        error('You must have one variable name for each column in the matrix parameter.');
    end
    %one 'matrix(:,k),' argument snippet per column
    column_args = cell(1,n_columns);
    for col=1:n_columns
        column_args{col} = ['matrix(:,', num2str(col), '),'];
    end
    joined_args = ['' column_args{:}];
    if names_given
        %the trailing comma is kept because the name/value pair follows it
        ds_arguments = [joined_args, '''VarNames'',','ifx_varnames'];
    elseif nargin==1
        %no names: just drop the trailing comma
        ds_arguments = joined_args(1:end-1);
    end
    %the constructor call is assembled as text and evaluated here, where
    %matrix and ifx_varnames are in scope
    output = eval(['dataset(' ds_arguments ');']);
end%innerfx
%END OF INNER FUNCTIONS
end%main fx