-
Notifications
You must be signed in to change notification settings - Fork 1
/
S01_Prepare_ASE_data.m
201 lines (159 loc) · 8.55 KB
/
S01_Prepare_ASE_data.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
% This file gets a set of ASE data files and merges them into a matrix,
% it make a Union of all SNPs putting NaN into missing values. The folmat
% of these files is described/defined further below in the section "%% Constants/ Input file format"
% "FileNAmePattern" indicates the input files, it can be for example:
% Myfoler/ all files in this this folder
% Myfolder/*.txt
% *.[1-9].dd
% Basically any sort of pattern that fits to linux find function would fit
% here too. '
% Example: S01_Prepare_ASE_data('01_Data/AE_Sample_Data/*.txt', 10, 'Test_run');
% Pej March 2015 NYGC
%--------------------
function S01_Prepare_ASE_data(FileNamePattern, minOccurance, AnalysisLabel)
if nargin == 0
% Make a test run
AnalysisLabel= 'Test_run';
end
OutFolder = ['02_Results/' AnalysisLabel];
mkdir(OutFolder);
delete([OutFolder '/S01_Prepare_ASE_data_run.log']);
diary( [OutFolder '/S01_Prepare_ASE_data_run.log']);
disp('Run started at:')
disp(datetime(now,'ConvertFrom','datenum'))
if nargin == 0
beep
disp('No input parameters were given. Running demo on sample data.')
minOccurance = 10;
FileNamePattern = '01_Data/AE_Sample_Data/*.txt';
end
%% Run the script
try
S01_Prepare_ASE_data_(FileNamePattern, minOccurance, OutFolder)
catch ee
ee.getReport
diary off
end
end
function S01_Prepare_ASE_data_(FileNamePattern, minOccurance, OutFolder)
%% Constants/ Input file format (PARAMETERS)
% Mandatory fields ======================
ColumnLabel_for_uniqueID = 'VARIANT_ID'; % Label for the column carrying a unique varinat ID for aeSNP. **Edit this to match your data**
ColumnLabel_for_refCount = 'REF_COUNT'; % Label for the column carrying ref. allele count at aeSNP. **Edit this to match your data**
ColumnLabel_for_altCount = 'ALT_COUNT'; % Label for the column carrying alt. allele count at aeSNP. **Edit this to match your data**
ColumnLabel_for_GeneID = 'GENE_ID'; % Label for the column carrying the gene ID associated with aeSNP. **Edit this to match your data**
% Optional fields (if you don't need these comment out the lines)
ColumnLabel_for_null_ratio = 'NULL_RATIO'; % Label for the column carrying the null ratio for the aeSNP. This is the reference bias, we define it as the expected reference ratio in absence of any regulatory difference between the haplotype. If you don't have this in your data, delete this line and we assume null ratio is 0.50 **Edit this to match your data**
% Optional QC columns (if you don't need these comment out the lines)
% if there are columns "MAPPING_BIAS_SIM", "GENOTYPE_WARNING", and
% "LOW_MAPABILITY" are available in the data, all lines with 1 in either of
% those will be removed before the analysis too. If you have some other QC
% things just apply to the data before running ANEVA.
QC_ColumnLabels = {'MAPPING_BIAS_SIM', 'GENOTYPE_WARNING', 'LOW_MAPABILITY'}; % QC_ColumnLabels is a cell array of column names for qulity control columns in data.
MinCount = 8; % Discard ASE data with total count smaller than this from the data
%% Default values
Files = Pej_GetFiles(FileNamePattern);
NF = length(Files);
if nargin <2
minOccurance = min(10, NF); % Variants that are less common than this will be discarded from the file "Common_SNPs".
end
%% The rest of the code
fprintf(1, '%d files found!\n', NF);
for i = NF:-1:1
fprintf(1, '%i files left...', i);
% FormatS = '%s%s%s%s%s%s%s%s%f%f%f%f%f%f%f%f%s%s%s%s%s%f%f%f'; % for standard GTEx_V6
% FormatS='%s%s%s%s%s%f%f%f%f%f%f%f%f%s%s%f%s%s%f%f%f%f%f%f'; % For GEUVADIS
% tmpD= Pej_Read_Table(Files{i}, [], false, FormatS);
tmpD= Pej_Read_Table(Files{i}, [], false, [], true);
% converting the data column names to what is used around the code.
D{i}.uniqueID =tmpD.(ColumnLabel_for_uniqueID);
D{i}.refCount = str2double(tmpD.(ColumnLabel_for_refCount));
D{i}.totalCount = D{i}.refCount + str2double(tmpD.(ColumnLabel_for_altCount));
D{i}.GeneID =tmpD.(ColumnLabel_for_GeneID);
% D{i}.chr = tmpD.CHR;
% D{i}.position = tmpD.POS;
% D{i}.Variant_Annotation = repmat({'NA'}, size(D{i}.GeneID,1), size(D{i}.GeneID,2));
MetaData.SAMPLE_ID{i} = tmpD.SAMPLE_ID{1};
% MetaData.SUBJECT_ID{i} = tmpD.SAMPLE_ID{1}(1:find(tmpD.SAMPLE_ID{1}=='.',1)-1);
%% If null ratio is missing from the data, leave it empty
try
D{i}.null_ratio = str2double(tmpD.(ColumnLabel_for_null_ratio));
catch
D{i}.null_ratio = nan(size(D{i}.uniqueID));
end
%% Discard low quality aeSNPs if the columns are there, if they are not it'll just skip it.
% See Castel et al. 2015 for reference (https://doi.org/10.1186/s13059-015-0762-6)
tmpFilt = false(size(D{i}.uniqueID));
for i_QC = 1:length(QC_ColumnLabels)
try
tmpFilt = tmpFilt | str2double(tmpD.(QC_ColumnLabels{i_QC}))==1;
catch err
error(['Optional QC column "' QC_ColumnLabels{i_QC} '" is not available in the data. If it is supposed to be there check if there is a typo. If you don''t have or need it remove the name of this column from the list of QC columns at the begining of S01_Prepare_ASE_data.m'])
end
end
fprintf('\n%d our of %d (%.2f%%) variants were filtered from %s\n', sum(tmpFilt), length(tmpFilt), (sum(tmpFilt)/length(tmpFilt))*100, tmpD.SAMPLE_ID{1});
D{i}= Pej_Struct_RowDel(D{i}, tmpFilt);
D{i} = Pej_Struct_RowSelect(D{i}, D{i}.totalCount>=MinCount);
end
mkdir(OutFolder);
% save([OutFolder '/All_Imported'], 'D', 'FileNamePattern');
% load('All_Imported');
%% Join them in a single matrix
jSNPs = Union_stats(D);
figure
hist(jSNPs.occurance(jSNPs.occurance>1), 1:max(jSNPs.occurance));
% xlim([0 101])
xlabel('Hetrozygous occurance in dataset')
ylabel('Number of aeSNPs')
Pej_SavePlot(gcf, [OutFolder '/Figures/aeSNP_occurrence']);
%%
jSNPcommon= Pej_Struct_RowDel(jSNPs, jSNPs.occurance<minOccurance); clear jSNPs
jSNPcommon.totalCount = nan(length(jSNPcommon.uniqueID),NF);
jSNPcommon.refCount = nan(length(jSNPcommon.uniqueID),NF);
jSNPcommon.null_ratio = nan(size(jSNPcommon.uniqueID));
jSNPcommon.GeneID = cell(size(jSNPcommon.uniqueID));
% jSNPcommon.chr = cell(size(jSNPcommon.uniqueID));
% jSNPcommon.position = cell(size(jSNPcommon.uniqueID));
% jSNPcommon.Variant_Annotation = cell(size(jSNPcommon.uniqueID));
for i = 1:NF
[~, Ij, Id] = intersect(jSNPcommon.uniqueID, D{i}.uniqueID);
jSNPcommon.totalCount(Ij,i) = D{i}.totalCount(Id);
jSNPcommon.refCount(Ij,i) = D{i}.refCount(Id);
% jSNPcommon.chr(Ij,1) = D{i}.chr(Id); % This line does not really have o be done so many times but it's a lazy apporaoch to make sure all SNPs are covered.
% jSNPcommon.position(Ij,1) = D{i}.position(Id); % This line does not really have o be done so many times but it's a lazy apporaoch to make sure all SNPs are covered.
jSNPcommon.null_ratio(Ij,1) = D{i}.null_ratio(Id); % This line does not really have o be done so many times but it's a lazy apporaoch to make sure all SNPs are covered.
jSNPcommon.GeneID(Ij,1) = D{i}.GeneID(Id); % This line does not really have o be done so many times but it's a lazy apporaoch to make sure all SNPs are covered.
% jSNPcommon.Variant_Annotation(Ij,1) = D{i}.Variant_Annotation(Id); % This line does not really have o be done so many times but it's a lazy apporaoch to make sure all SNPs are covered.
% delete it
D{i} = [];
end
%% Read in sample expression normalization factors
save([ OutFolder '/Common_SNPs'], 'jSNPcommon', 'MetaData');%,'-v7.3'); % ver 7.3 allows for saving large arrays.
disp(['Data for ' num2str(length(jSNPcommon.uniqueID)) ' aeSNPs was saved in: ' OutFolder '/Common_SNPs.mat'] )
tfout = fopen([OutFolder '/Common_SNPs_SNPids.txt'], 'w');
fprintf(tfout, 'snp_ids\n');
fprintf(tfout, '%s\n', jSNPcommon.uniqueID{:});
fclose(tfout);
disp(['Used aeSNP IDs were saved in: ' [OutFolder '/Common_SNPs_SNPids.txt']] )
disp('Run ended at:')
disp(datetime(now,'ConvertFrom','datenum'))
diary off
end
function U = Union_stats(D)
tmpFile = ['temp_' num2str(rand*1E+16) '_' num2str(cputime)];
tmpF = fopen(tmpFile, 'w');
for i = 1:length(D)
fprintf(tmpF, '%s\n', D{i}.uniqueID{:});
end
fclose(tmpF);
system(sprintf('sort %s >%s.sorted', tmpFile, tmpFile));
system(sprintf('uniq -c %s.sorted >%s.counts', tmpFile, tmpFile));
system(sprintf('rm %s', tmpFile));
system(sprintf('rm %s.sorted', tmpFile));
tmpF = fopen(sprintf('%s.counts', tmpFile), 'r');
tmpIDs = textscan(tmpF, '%f%s');
fclose(tmpF);
U.uniqueID = tmpIDs{2};
U.occurance = tmpIDs{1};
system(sprintf('rm %s.counts',tmpFile));
end