-
Notifications
You must be signed in to change notification settings - Fork 1
/
Pej_Genomic_Intersection_2.m
189 lines (143 loc) · 5.32 KB
/
Pej_Genomic_Intersection_2.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
function [I1, I2]= Pej_Genomic_Intersection_2(Loci1, Loci2)
% This function reports the overlaps between two sets of genomic locations.
% NOTE: this function does THE SAME JOB as "Pej_Genomic_Intersection.m", it
% just has a different implementation. The other one is usually faster than
% this, but this one can be faster if there are a large amount of
% overlapping loci at one given point, like 100 or 1000 overlapping things at one point.
% INPUTS:
% Two Loci structures:
% Loci.chr[nx1] : chromosome names: (string)
% Loci.start[nx1] : start of the position: (positive int)
% Loci.end[nx1] : end of the position: (positive int)
% Loci.dir[nx1] : strand of the locale: (char: [ '+' / '-'])
% OUTPUTS:
% I1, I2, and two cell arrays such that: n-th locale in the Loci1 intersects
% with all locales in Loci2 with the index I1{n}
% Writtn by Pejman Mohammadi
% 26 June 2014, Basel
% Pejman.m@gmail.com
% ---------------------------------
L1 = size(Loci1.start,1);
L2 = size(Loci2.start,1);
if ~isfield(Loci1, 'chr_num')
tmpCHRs =[Loci1.chr; Loci2.chr];
[~, ~, tmpCi] = unique(tmpCHRs);
Loci1.chr_num = tmpCi(1:L1);
Loci2.chr_num = tmpCi(L1+1:end);
end
Loci1.ID(:,1) = 1:L1;
Loci2.ID(:,1) = 1:L2;
senseFilt1 = Loci1.dir == '+';
senseFilt2 = Loci2.dir == '+';
I1 = cell(size(senseFilt1));
I2 = cell(size(senseFilt2));
fprintf('\n%d loci to compare against %d loci\n', L1, L2);
tic
fprintf('\nAnalyzing sense strand...\nChr: ');
[I1( senseFilt1), I2( senseFilt2) ] = GGI_noStrand(Pej_Struct_RowSelect(Loci1, senseFilt1), Pej_Struct_RowSelect(Loci2, senseFilt2));
fprintf('\nAnalyzing antisense strand...\nChr: ');
nS1 = ~senseFilt1;nS2 = ~senseFilt2;
[I1(nS1), I2(nS2) ] = GGI_noStrand(Pej_Struct_RowSelect(Loci1,nS1), Pej_Struct_RowSelect(Loci2,nS2));
fprintf('\nDone and Done! (%.0f secs)\n', toc);
end
function [I1, I2]= GGI_noStrand(Loci1, Loci2)
% This fuction works for inputs that come the same strand.
s1 = size(Loci1.chr_num);
s2 = size(Loci2.chr_num);
I1 = cell(s1);
I2 = cell(s2);
CHRs = unique(union(unique(Loci1.chr_num), unique(Loci2.chr_num))); % find all Chromosomes
unused1 = true(s1);
unused2 = true(s2);
for c = 1: length(CHRs)
fprintf('%d ', CHRs(c));
F1 = false(s1);
F2 = false(s2);
F1(unused1) = Loci1.chr_num(unused1) == CHRs(c);
F2(unused2) = Loci2.chr_num(unused2) == CHRs(c);
[I1(F1), I2(F2)] = GGI_noStrand_noChr(Pej_Struct_RowSelect(Loci1, F1), Pej_Struct_RowSelect(Loci2, F2));
unused1(F1) = false;
unused2(F2) = false;
end
end
function [I1, I2] = GGI_noStrand_noChr(Loci1, Loci2)
% This fuction works for inputs that come the same strand of the same chromosome.
if isempty(Loci1.start) || isempty(Loci2.start)
I1=repmat({[]}, size(Loci1.start));
I2=repmat({[]}, size(Loci2.start));
return
end
if issorted(Loci1.start)
if Loci1.start(1)<=Loci1.start(end)
% it's ascending
ix1 = 1:length(Loci1.start);
else
% it's descending
ix1 = length(Loci1.start):-1:1;
end
else
[~, ix1] = sort(Loci1.start, 'ascend');
end
if issorted(Loci2.start)
if Loci2.start(1)<=Loci2.start(end)
% it's ascending
ix2 = 1:length(Loci2.start);
else
% it's descending
ix2 = length(Loci2.start):-1:1;
end
else
[~, ix2] = sort(Loci2.start, 'ascend');
end
[I1(ix1), I2(ix2)] = GGI_noStrand_noChr_sorted(Pej_Struct_RowSelect(Loci1, ix1), Pej_Struct_RowSelect(Loci2, ix2));
end
function [I1, I2] = GGI_noStrand_noChr_sorted(Loci1, Loci2)
% try
% This fuction works for inputs that come the same strand of the same chromosome, and are sorted ascendingly for start positions.
L1 = size(Loci1.start,1);
L2 = size(Loci2.start,1);
I1 = cell(L1,1);
I2 = cell(L2,1);
mx = min([Loci1.start(1), Loci2.start(1)]);
Mx = max([max(Loci1.end), max(Loci2.end)]);
pointer1 = 1;
pointer2 = 1;
Overlapping1 = []; New_overlaps1 = [];
Overlapping2 = []; New_overlaps2 = [];
% these are the points that matter!
Milestones = union(union(Loci1.start, Loci2.start), union(Loci1.end, Loci2.end))';
for t = 1:length(Milestones)
x = Milestones(t);
% Check if overlapping hits are still overlapping
Retain_OL1 = Loci1.end(Overlapping1)>=x;
Retain_OL2 = Loci2.end(Overlapping2)>=x;
% Check if new overlapping sets have started
while pointer1<=L1 && x == Loci1.start(pointer1)
New_overlaps1 = union(New_overlaps1, pointer1);
pointer1 = pointer1+1;
end
while pointer2<=L2 && x == Loci2.start(pointer2)
New_overlaps2 = union(New_overlaps2, pointer2);
pointer2 = pointer2+1;
end
% Update Overlapping sets
Overlapping1 = [Overlapping1(Retain_OL1), New_overlaps1]; New_overlaps1 = [];
Overlapping2 = [Overlapping2(Retain_OL2), New_overlaps2]; New_overlaps2 = [];
% Report Overlaps
if ~isempty(Overlapping1) && ~isempty(Overlapping2)
% Add the current overlapping loci to those that already exist
for i = 1:length(Overlapping1)
I1{Overlapping1(i)} = union(I1{Overlapping1(i)}, Overlapping2);
end
for i = 1:length(Overlapping2)
I2{Overlapping2(i)} = union(I2{Overlapping2(i)}, Overlapping1);
end
end
end
% Translate the indexes to the original IDs
for i = 1:L1; I1{i} = Loci2.ID(I1{i}); end
for i = 1:L2; I2{i} = Loci1.ID(I2{i}); end
% catch err
% 3
% end
end