-
Notifications
You must be signed in to change notification settings - Fork 2
/
statistics_on_genes.m
146 lines (114 loc) · 6.71 KB
/
statistics_on_genes.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# statistics_on_genes runs k-means clustering with multi-dimensional scaling for transcript data downloaded from Cyanomics database or flux data generated from RUN_all.m
load('SynechococcusPCC7002.mat'); %fbamodel
load('Syn7002_IDs.mat'); % list of gene IDs extracted from transcriptomic reads file
genes = fbamodel.genes;
genes_in_dataset = Syn7002_IDs;
V = numel(genes);
M = 2; %number of objectives
% Load transcripts
load('transcripts.mat')
% Load flux matrices and calculate flux fold changes by dividing all experimental flux vectors by control flux vector
load('all_atp_flux.mat')
load('all_p1_flux.mat')
load('all_p2_flux.mat')
ATP_FC=all_atp_flux(:,1:23)./all_atp_flux(:,24);
P1_FC=all_p1_flux(:,1:23)./all_p1_flux(:,24);
P2_FC=all_p2_flux(:,1:23)./all_p2_flux(:,24);
ATP_FC(isnan(ATP_FC))=1;%set NaN values to 1
ATP_FC(isinf(ATP_FC))=max_ATP_FC;% set inf values to max flux value in matrix
P1_FC(isnan(P1_FC))=1;
P1_FC(isinf(P1_FC))=max_P1_FC;
P2_FC(isnan(P2_FC))=1;
P2_FC(isinf(P2_FC))=max_P2_FC;
transcripts(24,:)=ones; %all ones for standard control FC
ATP_FC=ATP_FC';
ATPTF=horzcat(transcripts,ATP_FC);
ATPTF(24,:)=ones; %all ones for standard control FC
P1_FC=P1_FC';
P1TF=horzcat(transcripts,P1_FC);
P1TF(24,:)=ones; %all ones for standard control FC
P2_FC=P2_FC';
P2TF=horzcat(transcripts,P2_FC);
P2TF(24,:)=ones; %all ones for standard control FC
%% COMPUTE PROFILES OF THE NONDOMINATED POINTS
all_objpairs = transcripts'; % choose dataset i.e. from transcripts (transcripts) , fluxes (all_atp_flux, all_p1_flux, all_p2_flux), or combined (ATPTF, P1TF, P2TF)
all_solutions = transcripts'; % same dataset here
%%
all_biomass_values = all_solutions(:,1);
profiles = all_objpairs;%(ix_of_interest,:);
genes_vs_profiles = profiles'; %we need to use profiles' and not profiles, because otherwise it would compute the correlation (and all the following measures) between profiles along all the genes, while we want the correlation between genes along the profiles
dist_correlation_vector = pdist(zscore(genes_vs_profiles), 'correlation'); %use zscore to standardize each of the profiles to have zero mean and unit variance
dist_correlation_matrix = squareform(dist_correlation_vector);
clusterTree = linkage(dist_correlation_vector, 'average');
%% *******************************************************************************************************************************************************************************
%try k-means clustering with a few number of clusters: the result was
%that the max(silhouette) was k=10 for the high_biomass_points and k=6 for the low_biomass_points
%Note that we discarded trivial clustering solutions with too few clusters
% prompt = 'k-means: Press ''y'' if the number of cluster is known, or any other key to execute silhouette analysis ';
% answer = input(prompt,'s');
%
% if strcmp(answer,'y')
% mean_silhouette = zeros(1,30);
% for NoClust = 2:30
% [cidx, ctrs] = kmeans(genes_vs_profiles, NoClust, 'dist','correlation', 'rep', 5, 'disp','final'); %kmeans_corr_1 is my modified version of pdist, in which we use x - 1 instead of x - mean(x) all over the place in the definition of the correlation distance here http://www.mathworks.co.uk/help/stats/pdist.html?refresh=true
% %figure;
%
% % We now compute silhouette values. Large silhouette value, greater than 0.6, indicating that the cluster is somewhat separated from neighboring clusters. However, the third cluster contains many points with low silhouette values, and the first contains a few points with negative values, indicating that those two clusters are not well separated.
% % The average s(i) over all data of a cluster is a measure of how tightly grouped all the data in the cluster are.
% figure;
% [silh5,h] = silhouette(genes_vs_profiles,cidx,'corr');
% mean_silhouette(NoClust) = mean(silh5);
% h = gca;
% h.Children.EdgeColor = [.8 .8 1];
% close(gcf) %we close the silhouette plot otherwise we have one plot per each index of the loop
% xlabel 'Silhouette Value';
% ylabel 'Cluster';
%
% end
% end
%
% figure
% plot(mean_silhouette);
%%
prompt = 'k-means: what is the number of clusters chosen after inspection of the mean_silhouette plot? (10 for high biomass, 9 for low biomass) ';
num_clusters = input(prompt)
%% k-means clustering
[cidx, ctrs] = kmeans(genes_vs_profiles, num_clusters, 'dist','cityblock', 'rep',5, 'disp','final'); %kmeans_corr_1 is my modified version of pdist, in which we use x - 1 instead of x - mean(x) all over the place in the definition of the correlation distance here http://www.mathworks.co.uk/help/stats/pdist.html?refresh=true
figure
for c = 1:num_clusters
subplot(5,ceil(num_clusters/5),c);
plot(genes_vs_profiles((cidx == c),:)');
%axis tight
end
suptitle('K-Means Clustering of Genes');
% We now compute silhouette values. Large silhouette value, greater than 0.6, indicating that the cluster is somewhat separated from neighboring clusters. However, the third cluster contains many points with low silhouette values, and the first contains a few points with negative values, indicating that those two clusters are not well separated.
% The average s(i) over all data of a cluster is a measure of how tightly grouped all the data in the cluster are.
figure;
[silh5,h] = silhouette(genes_vs_profiles,cidx,'cityblock'); %silhouette_corr_1 is a special version of silhouette
h = gca;
h.Children.EdgeColor = [.8 .8 1];
xlabel 'Silhouette Value';
ylabel 'Cluster';
% PLOT AVERAGE IN EVERY CLUSTER
figure
for c = 1:num_clusters
subplot(5,ceil(num_clusters/5),c);
plot(ctrs(c,:)');
%axis tight
axis off % turn off the axis
end
suptitle('K-Means Clustering of Profiles');
% % %% PLOT DISTANCE BETWEEN GENES THROUGH MULTI-DIMENSIONAL SCALING APPLIED TO THE CORRELATION MATRIX TO MAKE IT BECOME A DISTANCE MATRIX
% %
%[Y,stress] = mdscale_robust(dist_correlation_vector,2,'sstress','metricsstress');
%[Y,stress] = mdscale_robust(dist_correlation_vector,2,'criterion','metricstress');
options = statset('MaxIter',500); % change MaxIter to 500
[Y,stress] = mdscale_robust(dist_correlation_vector,2,'criterion','sstress','start','random','Options',options); %mdscale_robust is a variation which circumvents colocation error by multiplying dissimilarities by a scalar
figure
C = cidx; %colour according to kmeans clustering
colormap(jet(256))
scatter(Y(:,1),Y(:,2),200,C,'.');
title(['K-Means Clustering (k=' num2str(numel(unique(cidx))) ')']);
labels = num2str((1:size(Y,1))','%d'); %'
text(Y(:,1), Y(:,2), labels, 'horizontal','left', 'vertical','bottom')
%colorbar;