forked from pejminister/Pejtools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Pej_Genomic_Intersection.m
219 lines (176 loc) · 6.34 KB
/
Pej_Genomic_Intersection.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
function [I]= Pej_Genomic_Intersection(Loci1, Loci2)
% This function reports the overlaps between two sets of genomic locations.
% NOTE: this function does THE SAME JOB as "Pej_Genomic_Intersection_2.m", it
% just has a different implementation. The other one is usually slower than
% this, but this one can be slow if there are a large amount of
% overlapping loci at one given point, like 100 or 1000 overlapping things at one point.
% INPUTS:
% Two Loci structures:
% Loci.chr[nx1] : chromosome names: (string)
% Loci.start[nx1] : start of the position: (positive int)
% Loci.end[nx1] : end of the position: (positive int)
% Loci.dir[nx1] : strand of the locale: (char: [ '+' / '-']); if missing matching will be done disregarding the strand
% Loci.chr_num[nx1] : [optional] chromosome numeric Ids: (number)
% OUTPUTS:
% I: A 2xz array such that each row, [n, m], says n-th locale in the Loci1 intersects
% with m-th locale in the loci 2. If one set has more than one intersecting
% sets in the other set, it will show up in multiple lines.
% Writtn by Pejman Mohammadi
% 27 June 2014, Basel
% Pejman.m@gmail.com
% ---------------------------------
% An ID field is defined in this code so we remove it from the data if it happens to be there!
if isfield(Loci1, 'ID')
warning('There was an ID field in the data, FYI I removed it :)')
Loci1 = rmfield(Loci1, 'ID');
end
if isfield(Loci2, 'ID')
warning('There was an ID field in the data, FYI I removed it :)')
Loci2 = rmfield(Loci2, 'ID');
end
L1 = size(Loci1.start,1);
L2 = size(Loci2.start,1);
if ~isfield(Loci1, 'chr_num')
tmpCHRs =[Loci1.chr; Loci2.chr];
[~, ~, tmpCi] = unique(tmpCHRs);
Loci1.chr_num = tmpCi(1:L1);
Loci2.chr_num = tmpCi(L1+1:end);
end
Loci1.ID(:,1) = 1:L1;
Loci2.ID(:,1) = 1:L2;
if ~isfield(Loci1, 'dir') && ~isfield(Loci2, 'dir')
% disregard strand
senseFilt1 = true(size(Loci1.chr));
senseFilt2 = true(size(Loci2.chr));
elseif isfield(Loci1, 'dir') && isfield(Loci2, 'dir')
% use strand info
senseFilt1 = Loci1.dir == '+';
senseFilt2 = Loci2.dir == '+';
else
error('One input is stranded and the other one isnt, not cool!')
end
fprintf('\n%d loci to compare against %d loci\n', L1, L2);
tic
fprintf('\nAnalyzing sense strand...\nChr: ');
I =[];
[tmpI] = GGI_noStrand(Pej_Struct_RowSelect(Loci1, senseFilt1), Pej_Struct_RowSelect(Loci2, senseFilt2));
I = [I; tmpI];
fprintf('\nAnalyzing antisense strand...\nChr: ');
nS1 = ~senseFilt1;nS2 = ~senseFilt2;
[tmpI ] = GGI_noStrand(Pej_Struct_RowSelect(Loci1,nS1), Pej_Struct_RowSelect(Loci2,nS2));
fprintf('\nDone and Done! (%.0f secs)\n', toc);
I = [I; tmpI];
end
function [I]= GGI_noStrand(Loci1, Loci2)
% This fuction works for inputs that come the same strand.
s1 = size(Loci1.chr_num);
s2 = size(Loci2.chr_num);
I = [];
CHRs = unique(union(unique(Loci1.chr_num), unique(Loci2.chr_num))); % find all Chromosomes
unused1 = true(s1);
unused2 = true(s2);
for c = 1: length(CHRs)
fprintf('%d ', CHRs(c));
F1 = false(s1);
F2 = false(s2);
F1(unused1) = Loci1.chr_num(unused1) == CHRs(c);
F2(unused2) = Loci2.chr_num(unused2) == CHRs(c);
[tmpI] = GGI_noStrand_noChr(Pej_Struct_RowSelect(Loci1, F1), Pej_Struct_RowSelect(Loci2, F2));
I = [I; tmpI];
unused1(F1) = false;
unused2(F2) = false;
end
end
function [I] = GGI_noStrand_noChr(Loci1, Loci2)
% This fuction works for inputs that come the same strand of the same chromosome.
I = [];
if isempty(Loci1.start) || isempty(Loci2.start)
return
end
if issorted(Loci1.start)
if Loci1.start(1)<=Loci1.start(end)
% it's ascending
ix1 = 1:length(Loci1.start);
else
% it's descending
ix1 = length(Loci1.start):-1:1;
end
else
[~, ix1] = sort(Loci1.start, 'ascend');
end
if issorted(Loci2.start)
if Loci2.start(1)<=Loci2.start(end)
% it's ascending
ix2 = 1:length(Loci2.start);
else
% it's descending
ix2 = length(Loci2.start):-1:1;
end
else
[~, ix2] = sort(Loci2.start, 'ascend');
end
[tmpI] = GGI_noStrand_noChr_sorted(Pej_Struct_RowSelect(Loci1, ix1), Pej_Struct_RowSelect(Loci2, ix2));
I = [I; tmpI];
end
function [I] = GGI_noStrand_noChr_sorted(Loci1, Loci2)
try
% This fuction works for inputs that come the same strand of the same chromosome, and are sorted ascendingly for start positions.
L1 = size(Loci1.start,1);
L2 = size(Loci2.start,1);
mL = ceil(mean(L1,L2)/10);
I = zeros(mL, 2);
pointer1 = 1;
pointer2 = 1;
Overlapping1 = []; New_overlaps1 = [];
Overlapping2 = []; New_overlaps2 = [];
% these are the points that matter!
Milestones = union(union(Loci1.start, Loci2.start), union(Loci1.end, Loci2.end))';
lenI = 0;
for t = 1:length(Milestones)
x = Milestones(t);
% Check if overlapping hits are still overlapping
Overlapping1(Loci1.end(Overlapping1)<x)=[];
Overlapping2(Loci2.end(Overlapping2)<x)=[];
% Check if new overlapping sets have started
while pointer1<=L1 && x == Loci1.start(pointer1)
New_overlaps1 = [New_overlaps1; pointer1];
pointer1 = pointer1+1;
end
while pointer2<=L2 && x == Loci2.start(pointer2)
New_overlaps2 = [New_overlaps2; pointer2];
pointer2 = pointer2+1;
end
% Update Overlapping sets
if ~isempty(New_overlaps1)
Overlapping1 = [Overlapping1; New_overlaps1];
New_overlaps1 = [];
end
if ~isempty(New_overlaps2)
Overlapping2 = [Overlapping2; New_overlaps2];
New_overlaps2 = [];
end
% Report Overlaps
if ~isempty(Overlapping1) && ~isempty(Overlapping2)
tmpI = Cart_prod(Overlapping1, Overlapping2);
tL = size(tmpI,1);
if (lenI + tL)>size(I,1);
%increase the size of I; This is just to pre-allocate I to speed up
I(end+mL,:)=0;
end
I(lenI+(1:tL),:) = tmpI;
lenI = lenI+tL;
end
end
I = unique(I(1:lenI,:), 'rows') ;
I(:,1) = Loci1.ID(I(:,1));
I(:,2) = Loci2.ID(I(:,2));
catch
2
end
end
function P = Cart_prod(S1, S2)
if isscalar(S1) && isscalar(S2); P = [S1, S2]; return; end
S1 = unique(S1);S2 = unique(S2);
[X, Y] = meshgrid(S1, S2);
P = [X(:), Y(:)];
end