Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Splicing coverage fix #14

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 67 additions & 14 deletions src/epivizFileParser/BamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,29 +76,82 @@ def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respTyp
result = []

try:
iter = self.file.pileup(chr, start, end)
iter = self.file.pileup(
chr,
start,
end,
truncate=True,
)

chrTemp = valueTemp = None
startTemp = endTemp = start

chrTemp = startTemp = endTemp = valueTemp = None
for x in iter:
# get data
_current_reference_name = x.reference_name
_current_reference_pos = x.reference_pos
_current_valueTemp = x.get_num_aligned() # or x.nsegments

# print(x.pileups[0])

for pileupread in x.pileups:
# or look at x.get_query_sequences()
_current_valueTemp -= pileupread.is_del

if not x.reference_pos >= start or not x.reference_pos <= end:
# print(
# f"{_current_reference_pos} : {_current_valueTemp} : {x.get_query_sequences()}")

# is first?
if valueTemp is None:
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

endTemp = _current_reference_pos + 1
continue

if x.reference_pos == end and valueTemp is not None:

# is it at the end? did it finish before end?
if x.reference_pos == end-1 and valueTemp is not None:
# last min change
if valueTemp is not _current_valueTemp:
result.append((chrTemp, startTemp, endTemp, valueTemp))
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

result.append((chrTemp, startTemp, end, valueTemp))

continue

if valueTemp is None:
chrTemp = x.reference_name
startTemp = x.reference_pos
valueTemp = x.get_num_aligned()
elif valueTemp is not x.get_num_aligned():
# gap detection
if _current_reference_pos != endTemp:
# print('caught gap')
# print(f"{_current_reference_pos} : {endTemp}")
result.append((chrTemp, startTemp, endTemp, valueTemp))
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

endTemp = _current_reference_pos + 1
continue

# change of coverage value
if valueTemp is not _current_valueTemp:
result.append((chrTemp, startTemp, endTemp, valueTemp))
chrTemp = x.reference_name
startTemp = x.reference_pos
valueTemp = x.get_num_aligned()
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

endTemp = x.reference_pos+1
endTemp = _current_reference_pos + 1
continue

# no change
endTemp = _current_reference_pos + 1

# last insertion check
if len(result) == 0 or startTemp != result[len(result) - 1][1]:
if valueTemp is not None:
result.append((chrTemp, startTemp, endTemp, valueTemp))

self.get_col_names()

Expand Down
25 changes: 23 additions & 2 deletions src/epivizFileParser/SplicingBamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,32 @@ def get_col_names(self):
return self.columns

def _get_junctions(self, chr, start, end):
iter = self.file.fetch(chr, start, end)
# iter = self.file.fetch(chr, start, end)
iter = self.file.pileup(
chr,
start,
end,
truncate=True,
)

reads_set = set()
reads_refs = []

for pileupcolumn in iter:
for pileupread in pileupcolumn.pileups:
_name = pileupread.alignment.query_name

if _name in reads_set:
pass
else:
reads_set.add(_name)
reads_refs.append(pileupread.alignment)

junctions = {}

for read in iter:
for read in reads_refs:
# print(read.query_name)

if read.cigartuples is None:
# Skipping read with no CIGAR string
continue
Expand Down
14 changes: 12 additions & 2 deletions tests/test_BamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pytest
import os

import pandas as pd

from epivizFileParser import BamFile

__author__ = "jkanche, alexanv8"
Expand All @@ -18,11 +20,19 @@


def test_getRange():
res, error = bb.getRange("chr10", 1, 100000000)
coverage, error = bb.getRange("chr10", 1, 100000000)
assert(error == None)

assert(len(bb.columns) == 4)
assert(bb.columns == ["chr", "start", "end", "value"])
assert(len(res) == 8)
assert(len(coverage) == 25)

assert (coverage['value'].tolist() == [0, 1, 0, 1, 0, 1, 0,
0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
assert (coverage['start'].tolist() == [10265460, 10265473, 10266155, 10266200, 10266296, 10266311, 10266332, 10266594, 10266646, 10266743, 10266857,
10267044, 10267063, 10267104, 10267116, 10267117, 10267118, 10267119, 10267121, 10267122, 10267142, 10267224, 10267690, 10267691, 10268169])
assert (coverage['end'].tolist() == [10265473, 10266155, 10266200, 10266296, 10266311, 10266332, 10266374, 10266646, 10266743, 10266808, 10267044,
10267063, 10267104, 10267116, 10267117, 10267118, 10267119, 10267121, 10267122, 10267142, 10267224, 10267690, 10267691, 10268169, 10268186])


def test_input_error():
Expand Down
20 changes: 6 additions & 14 deletions tests/test_SplicingBamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,13 @@ def test_getRange():
assert(len(bb.columns) == 6)
assert(bb.columns == ["chr", "region1_start", "region2_start",
"region1_end", "region2_end", "value"])
assert(len(coverage) == 8)
assert(len(junctions) == 14)
assert(len(junctions) == 8)

assert (coverage['value'].tolist() == [0, 1, 0, 1, 0, 1, 2, 1])
assert (coverage['start'].tolist() == [10265460, 10265473,
10266332, 10266646, 10266743, 10267044, 10267063, 10267142])
assert (coverage['end'].tolist() == [10265473, 10266332,
10266646, 10266743, 10267044, 10267063, 10267142, 10268169])

assert (junctions['value'].tolist() == [
1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
assert (junctions['region2_start'].tolist() == [10266201, 10266312, 10266657, 10266723, 10266972,
10266978, 10267047, 10266897, 10267023, 10267117, 10267120, 10267123, 10267225, 10267692])
assert (junctions['region1_end'].tolist() == [10266155, 10266296, 10266641, 10266707, 10266961,
10266972, 10267043, 10266881, 10267007, 10267104, 10267117, 10267121, 10267118, 10267690])
assert (junctions['value'].tolist() == [1, 1, 1, 1, 1, 1, 1, 1])
assert (junctions['region2_start'].tolist() == [
10266201, 10266312, 10267023, 10267117, 10267120, 10267123, 10267225, 10267692])
assert (junctions['region1_end'].tolist() == [
10266155, 10266296, 10267007, 10267104, 10267117, 10267121, 10267118, 10267690])


# same test as bam file (copy paste)
Expand Down