Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bam reads #15

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 67 additions & 14 deletions src/epivizFileParser/BamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,29 +76,82 @@ def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respTyp
result = []

try:
iter = self.file.pileup(chr, start, end)
iter = self.file.pileup(
chr,
start,
end,
truncate=True,
)

chrTemp = valueTemp = None
startTemp = endTemp = start

chrTemp = startTemp = endTemp = valueTemp = None
for x in iter:
# get data
_current_reference_name = x.reference_name
_current_reference_pos = x.reference_pos
_current_valueTemp = x.get_num_aligned() # or x.nsegments

# print(x.pileups[0])

for pileupread in x.pileups:
# or look at x.get_query_sequences()
_current_valueTemp -= pileupread.is_del

if not x.reference_pos >= start or not x.reference_pos <= end:
# print(
# f"{_current_reference_pos} : {_current_valueTemp} : {x.get_query_sequences()}")

# is first?
if valueTemp is None:
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

endTemp = _current_reference_pos + 1
continue

if x.reference_pos == end and valueTemp is not None:

# is it at the end? did it finish before end?
if x.reference_pos == end-1 and valueTemp is not None:
# last min change
if valueTemp is not _current_valueTemp:
result.append((chrTemp, startTemp, endTemp, valueTemp))
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

result.append((chrTemp, startTemp, end, valueTemp))

continue

if valueTemp is None:
chrTemp = x.reference_name
startTemp = x.reference_pos
valueTemp = x.get_num_aligned()
elif valueTemp is not x.get_num_aligned():
# gap detection
if _current_reference_pos != endTemp:
# print('caught gap')
# print(f"{_current_reference_pos} : {endTemp}")
result.append((chrTemp, startTemp, endTemp, valueTemp))
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

endTemp = _current_reference_pos + 1
continue

# change of coverage value
if valueTemp is not _current_valueTemp:
result.append((chrTemp, startTemp, endTemp, valueTemp))
chrTemp = x.reference_name
startTemp = x.reference_pos
valueTemp = x.get_num_aligned()
chrTemp = _current_reference_name
startTemp = _current_reference_pos
valueTemp = _current_valueTemp

endTemp = x.reference_pos+1
endTemp = _current_reference_pos + 1
continue

# no change
endTemp = _current_reference_pos + 1

# last insertion check
if len(result) == 0 or startTemp != result[len(result) - 1][1]:
if valueTemp is not None:
result.append((chrTemp, startTemp, endTemp, valueTemp))

self.get_col_names()

Expand Down
82 changes: 82 additions & 0 deletions src/epivizFileParser/ReadingsBamFile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# import pysam
# import numpy
from .BamFile import BamFile
from .utils import toDataFrame

__author__ = "Jayaram Kancherla, Victor Alexandru"
__copyright__ = "jkanche"
__license__ = "mit"


class ReadingsBamFile(BamFile):
"""
Class to parse bam files

Args:
file (str): file location can be local (full path) or hosted publicly
columns ([str]) : names of columns in the bam file

Attributes:
file: a pysam file object
fileSrc: location of the file
cacheData: cache of accessed data in memory
columns: column names
"""

def __init__(self, file, columns=None):
super().__init__(file, columns)

def get_col_names(self):
"""
Columns of a bam file
"""
if self.columns is None:
self.columns = ["chr", "name", "start", "length"]
return self.columns

def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType="DataFrame"):
"""
Get data for a genomic location

Args:
chr (str): chromosome
start (int): genomic start
end (int): genomic end
respType (str): result format type, default is "DataFrame

Returns:
a tuple of
result
a DataFrame with matched regions from the input genomic location if respType is DataFrame else result is an array
error
if there was any error during the process
"""

reads = []

try:
for read in self.file.fetch(chr, start, end):
# for future ref
# .get_blocks() # the actual broken segments seen in ivg
# .reference_start - .reference_end

reads.append((
read.reference_name, # chr
read.query_name, # read name
read.reference_start,
read.reference_length # actual length with gaps
))

self.get_col_names()

if respType == "DataFrame":
result = toDataFrame(reads, self.columns)

except ValueError as e:
result = toDataFrame(reads, self.get_col_names())
return result, "Invalid input. (chr, start, end)"
except Exception as e:
result = toDataFrame(reads, self.get_col_names())
return result, str(e)

return result, None
25 changes: 23 additions & 2 deletions src/epivizFileParser/SplicingBamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,32 @@ def get_col_names(self):
return self.columns

def _get_junctions(self, chr, start, end):
iter = self.file.fetch(chr, start, end)
# iter = self.file.fetch(chr, start, end)
iter = self.file.pileup(
chr,
start,
end,
truncate=True,
)

reads_set = set()
reads_refs = []

for pileupcolumn in iter:
for pileupread in pileupcolumn.pileups:
_name = pileupread.alignment.query_name

if _name in reads_set:
pass
else:
reads_set.add(_name)
reads_refs.append(pileupread.alignment)

junctions = {}

for read in iter:
for read in reads_refs:
# print(read.query_name)

if read.cigartuples is None:
# Skipping read with no CIGAR string
continue
Expand Down
1 change: 1 addition & 0 deletions src/epivizFileParser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .SamFile import SamFile
from .BamFile import BamFile
from .SplicingBamFile import SplicingBamFile
from .ReadingsBamFile import ReadingsBamFile
from .TbxFile import TbxFile
from .GtfFile import GtfFile
from .InteractionBigBed import InteractionBigBed
Expand Down
14 changes: 12 additions & 2 deletions tests/test_BamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pytest
import os

import pandas as pd

from epivizFileParser import BamFile

__author__ = "jkanche, alexanv8"
Expand All @@ -18,11 +20,19 @@


def test_getRange():
res, error = bb.getRange("chr10", 1, 100000000)
coverage, error = bb.getRange("chr10", 1, 100000000)
assert(error == None)

assert(len(bb.columns) == 4)
assert(bb.columns == ["chr", "start", "end", "value"])
assert(len(res) == 8)
assert(len(coverage) == 25)

assert (coverage['value'].tolist() == [0, 1, 0, 1, 0, 1, 0,
0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
assert (coverage['start'].tolist() == [10265460, 10265473, 10266155, 10266200, 10266296, 10266311, 10266332, 10266594, 10266646, 10266743, 10266857,
10267044, 10267063, 10267104, 10267116, 10267117, 10267118, 10267119, 10267121, 10267122, 10267142, 10267224, 10267690, 10267691, 10268169])
assert (coverage['end'].tolist() == [10265473, 10266155, 10266200, 10266296, 10266311, 10266332, 10266374, 10266646, 10266743, 10266808, 10267044,
10267063, 10267104, 10267116, 10267117, 10267118, 10267119, 10267121, 10267122, 10267142, 10267224, 10267690, 10267691, 10268169, 10268186])


def test_input_error():
Expand Down
46 changes: 46 additions & 0 deletions tests/test_ReadingsBamFile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

import pytest
import os

from epivizFileParser import ReadingsBamFile

__author__ = "jkanche, alexanv8"
__copyright__ = "jkanche"
__license__ = "mit"

"""
The file test.bigBed and test assertions
come from the pyBigWig library
"""

bb = ReadingsBamFile("tests/data/test.bam")


def test_getRange():
reads, error = bb.getRange("chr10", 10265461, 10266657)
assert(error == None)

assert(len(bb.columns) == 4)
assert(bb.columns == ["chr", "name", "start", "length"])
assert(len(reads) == 4)

assert (reads['name'].tolist() == ['973:285',
'1198:1458', '2195:2352', '554:2315'])
assert (reads['start'].tolist() == [
10265460, 10266594, 10266600, 10266656])
assert (reads['length'].tolist() == [914, 172, 186, 152])


def test_input_error():
res, error = bb.getRange("chr11", 1, 100000000)
assert(res.empty)
assert (error == "Invalid input. (chr, start, end)")

res, error = bb.getRange("chr10", 7, 4)
assert(res.empty)
assert (error == "Invalid input. (chr, start, end)")


def test_empty():
assert(len(bb.getRange("chr10", 4, 7)[0]) == 0)
20 changes: 6 additions & 14 deletions tests/test_SplicingBamFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,13 @@ def test_getRange():
assert(len(bb.columns) == 6)
assert(bb.columns == ["chr", "region1_start", "region2_start",
"region1_end", "region2_end", "value"])
assert(len(coverage) == 8)
assert(len(junctions) == 14)
assert(len(junctions) == 8)

assert (coverage['value'].tolist() == [0, 1, 0, 1, 0, 1, 2, 1])
assert (coverage['start'].tolist() == [10265460, 10265473,
10266332, 10266646, 10266743, 10267044, 10267063, 10267142])
assert (coverage['end'].tolist() == [10265473, 10266332,
10266646, 10266743, 10267044, 10267063, 10267142, 10268169])

assert (junctions['value'].tolist() == [
1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
assert (junctions['region2_start'].tolist() == [10266201, 10266312, 10266657, 10266723, 10266972,
10266978, 10267047, 10266897, 10267023, 10267117, 10267120, 10267123, 10267225, 10267692])
assert (junctions['region1_end'].tolist() == [10266155, 10266296, 10266641, 10266707, 10266961,
10266972, 10267043, 10266881, 10267007, 10267104, 10267117, 10267121, 10267118, 10267690])
assert (junctions['value'].tolist() == [1, 1, 1, 1, 1, 1, 1, 1])
assert (junctions['region2_start'].tolist() == [
10266201, 10266312, 10267023, 10267117, 10267120, 10267123, 10267225, 10267692])
assert (junctions['region1_end'].tolist() == [
10266155, 10266296, 10267007, 10267104, 10267117, 10267121, 10267118, 10267690])


# same test as bam file (copy paste)
Expand Down