diff --git a/src/epivizFileParser/BamFile.py b/src/epivizFileParser/BamFile.py index 45016a4..8942325 100644 --- a/src/epivizFileParser/BamFile.py +++ b/src/epivizFileParser/BamFile.py @@ -76,29 +76,82 @@ def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respTyp result = [] try: - iter = self.file.pileup(chr, start, end) + iter = self.file.pileup( + chr, + start, + end, + truncate=True, + ) + + chrTemp = valueTemp = None + startTemp = endTemp = start - chrTemp = startTemp = endTemp = valueTemp = None for x in iter: + # get data + _current_reference_name = x.reference_name + _current_reference_pos = x.reference_pos + _current_valueTemp = x.get_num_aligned() # or x.nsegments + + # print(x.pileups[0]) + + for pileupread in x.pileups: + # or look at x.get_query_sequences() + _current_valueTemp -= pileupread.is_del - if not x.reference_pos >= start or not x.reference_pos <= end: + # print( + # f"{_current_reference_pos} : {_current_valueTemp} : {x.get_query_sequences()}") + + # is first? + if valueTemp is None: + chrTemp = _current_reference_name + startTemp = _current_reference_pos + valueTemp = _current_valueTemp + + endTemp = _current_reference_pos + 1 continue - - if x.reference_pos == end and valueTemp is not None: + + # is it at the end? did it finish before end? + if x.reference_pos == end-1 and valueTemp is not None: + # last min change + if valueTemp is not _current_valueTemp: + result.append((chrTemp, startTemp, endTemp, valueTemp)) + chrTemp = _current_reference_name + startTemp = _current_reference_pos + valueTemp = _current_valueTemp + result.append((chrTemp, startTemp, end, valueTemp)) + continue - if valueTemp is None: - chrTemp = x.reference_name - startTemp = x.reference_pos - valueTemp = x.get_num_aligned() - elif valueTemp is not x.get_num_aligned(): + # gap detection + if _current_reference_pos != endTemp: + # print('caught gap') + # print(f"{_current_reference_pos} : {endTemp}") + result.append((chrTemp, startTemp, endTemp, valueTemp)) + chrTemp = _current_reference_name + startTemp = _current_reference_pos + valueTemp = _current_valueTemp + + endTemp = _current_reference_pos + 1 + continue + + # change of coverage value + if valueTemp is not _current_valueTemp: result.append((chrTemp, startTemp, endTemp, valueTemp)) - chrTemp = x.reference_name - startTemp = x.reference_pos - valueTemp = x.get_num_aligned() + chrTemp = _current_reference_name + startTemp = _current_reference_pos + valueTemp = _current_valueTemp - endTemp = x.reference_pos+1 + endTemp = _current_reference_pos + 1 + continue + + # no change + endTemp = _current_reference_pos + 1 + + # last insertion check + if len(result) == 0 or startTemp != result[len(result) - 1][1]: + if valueTemp is not None: + result.append((chrTemp, startTemp, endTemp, valueTemp)) self.get_col_names() diff --git a/src/epivizFileParser/ReadingsBamFile.py b/src/epivizFileParser/ReadingsBamFile.py new file mode 100644 index 0000000..d0737da --- /dev/null +++ b/src/epivizFileParser/ReadingsBamFile.py @@ -0,0 +1,82 @@ +# import pysam +# import numpy +from .BamFile import BamFile +from .utils import toDataFrame + +__author__ = "Jayaram Kancherla, Victor Alexandru" +__copyright__ = "jkanche" +__license__ = "mit" + + +class ReadingsBamFile(BamFile): + """ + Class to parse bam files + + Args: + file (str): file location can be local (full path) or hosted publicly + columns ([str]) : names of columns in the bam file + + Attributes: + file: a pysam file object + fileSrc: location of the file + cacheData: cache of accessed data in memory + columns: column names + """ + + def __init__(self, file, columns=None): + super().__init__(file, columns) + + def get_col_names(self): + """ + Columns of a bam file + """ + if self.columns is None: + self.columns = ["chr", "name", "start", "length"] + return self.columns + + def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType="DataFrame"): + """ + Get data for a genomic location + + Args: + chr (str): chromosome + start (int): genomic start + end (int): genomic end + respType (str): result format type, default is "DataFrame + + Returns: + a tuple of + result + a DataFrame with matched regions from the input genomic location if respType is DataFrame else result is an array + error + if there was any error during the process + """ + + reads = [] + + try: + for read in self.file.fetch(chr, start, end): + # for future ref + # .get_blocks() # the actual broken segments seen in ivg + # .reference_start - .reference_end + + reads.append(( + read.reference_name, # chr + read.query_name, # read name + read.reference_start, + read.reference_length # actual length with gaps + )) + + self.get_col_names() + + if respType == "DataFrame": + result = toDataFrame(reads, self.columns) + + except ValueError as e: + result = toDataFrame(reads, self.get_col_names()) + return result, "Invalid input. (chr, start, end)" + except Exception as e: + result = toDataFrame(reads, self.get_col_names()) + return result, str(e) + + return result, None diff --git a/src/epivizFileParser/SplicingBamFile.py b/src/epivizFileParser/SplicingBamFile.py index f2e7b49..850a863 100644 --- a/src/epivizFileParser/SplicingBamFile.py +++ b/src/epivizFileParser/SplicingBamFile.py @@ -36,11 +36,32 @@ def get_col_names(self): return self.columns def _get_junctions(self, chr, start, end): - iter = self.file.fetch(chr, start, end) + # iter = self.file.fetch(chr, start, end) + iter = self.file.pileup( + chr, + start, + end, + truncate=True, + ) + + reads_set = set() + reads_refs = [] + + for pileupcolumn in iter: + for pileupread in pileupcolumn.pileups: + _name = pileupread.alignment.query_name + + if _name in reads_set: + pass + else: + reads_set.add(_name) + reads_refs.append(pileupread.alignment) junctions = {} - for read in iter: + for read in reads_refs: + # print(read.query_name) + if read.cigartuples is None: # Skipping read with no CIGAR string continue diff --git a/src/epivizFileParser/__init__.py b/src/epivizFileParser/__init__.py index b5bd4f2..413747a 100644 --- a/src/epivizFileParser/__init__.py +++ b/src/epivizFileParser/__init__.py @@ -25,6 +25,7 @@ from .SamFile import SamFile from .BamFile import BamFile from .SplicingBamFile import SplicingBamFile +from .ReadingsBamFile import ReadingsBamFile from .TbxFile import TbxFile from .GtfFile import GtfFile from .InteractionBigBed import InteractionBigBed diff --git a/tests/test_BamFile.py b/tests/test_BamFile.py index 5b0e24b..58f10de 100644 --- a/tests/test_BamFile.py +++ b/tests/test_BamFile.py @@ -3,6 +3,8 @@ import pytest import os +import pandas as pd + from epivizFileParser import BamFile __author__ = "jkanche, alexanv8" @@ -18,11 +20,19 @@ def test_getRange(): - res, error = bb.getRange("chr10", 1, 100000000) + coverage, error = bb.getRange("chr10", 1, 100000000) assert(error == None) + assert(len(bb.columns) == 4) assert(bb.columns == ["chr", "start", "end", "value"]) - assert(len(res) == 8) + assert(len(coverage) == 25) + + assert (coverage['value'].tolist() == [0, 1, 0, 1, 0, 1, 0, + 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) + assert (coverage['start'].tolist() == [10265460, 10265473, 10266155, 10266200, 10266296, 10266311, 10266332, 10266594, 10266646, 10266743, 10266857, + 10267044, 10267063, 10267104, 10267116, 10267117, 10267118, 10267119, 10267121, 10267122, 10267142, 10267224, 10267690, 10267691, 10268169]) + assert (coverage['end'].tolist() == [10265473, 10266155, 10266200, 10266296, 10266311, 10266332, 10266374, 10266646, 10266743, 10266808, 10267044, + 10267063, 10267104, 10267116, 10267117, 10267118, 10267119, 10267121, 10267122, 10267142, 10267224, 10267690, 10267691, 10268169, 10268186]) def test_input_error(): diff --git a/tests/test_ReadingsBamFile.py b/tests/test_ReadingsBamFile.py new file mode 100644 index 0000000..febb93d --- /dev/null +++ b/tests/test_ReadingsBamFile.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- + +import pytest +import os + +from epivizFileParser import ReadingsBamFile + +__author__ = "jkanche, alexanv8" +__copyright__ = "jkanche" +__license__ = "mit" + +""" + The file test.bigBed and test assertions + come from the pyBigWig library +""" + +bb = ReadingsBamFile("tests/data/test.bam") + + +def test_getRange(): + reads, error = bb.getRange("chr10", 10265461, 10266657) + assert(error == None) + + assert(len(bb.columns) == 4) + assert(bb.columns == ["chr", "name", "start", "length"]) + assert(len(reads) == 4) + + assert (reads['name'].tolist() == ['973:285', + '1198:1458', '2195:2352', '554:2315']) + assert (reads['start'].tolist() == [ + 10265460, 10266594, 10266600, 10266656]) + assert (reads['length'].tolist() == [914, 172, 186, 152]) + + +def test_input_error(): + res, error = bb.getRange("chr11", 1, 100000000) + assert(res.empty) + assert (error == "Invalid input. (chr, start, end)") + + res, error = bb.getRange("chr10", 7, 4) + assert(res.empty) + assert (error == "Invalid input. (chr, start, end)") + + +def test_empty(): + assert(len(bb.getRange("chr10", 4, 7)[0]) == 0) diff --git a/tests/test_SplicingBamFile.py b/tests/test_SplicingBamFile.py index 17e7b4a..78eedf5 100644 --- a/tests/test_SplicingBamFile.py +++ b/tests/test_SplicingBamFile.py @@ -24,21 +24,13 @@ def test_getRange(): assert(len(bb.columns) == 6) assert(bb.columns == ["chr", "region1_start", "region2_start", "region1_end", "region2_end", "value"]) - assert(len(coverage) == 8) - assert(len(junctions) == 14) + assert(len(junctions) == 8) - assert (coverage['value'].tolist() == [0, 1, 0, 1, 0, 1, 2, 1]) - assert (coverage['start'].tolist() == [10265460, 10265473, - 10266332, 10266646, 10266743, 10267044, 10267063, 10267142]) - assert (coverage['end'].tolist() == [10265473, 10266332, - 10266646, 10266743, 10267044, 10267063, 10267142, 10268169]) - - assert (junctions['value'].tolist() == [ - 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) - assert (junctions['region2_start'].tolist() == [10266201, 10266312, 10266657, 10266723, 10266972, - 10266978, 10267047, 10266897, 10267023, 10267117, 10267120, 10267123, 10267225, 10267692]) - assert (junctions['region1_end'].tolist() == [10266155, 10266296, 10266641, 10266707, 10266961, - 10266972, 10267043, 10266881, 10267007, 10267104, 10267117, 10267121, 10267118, 10267690]) + assert (junctions['value'].tolist() == [1, 1, 1, 1, 1, 1, 1, 1]) + assert (junctions['region2_start'].tolist() == [ + 10266201, 10266312, 10267023, 10267117, 10267120, 10267123, 10267225, 10267692]) + assert (junctions['region1_end'].tolist() == [ + 10266155, 10266296, 10267007, 10267104, 10267117, 10267121, 10267118, 10267690]) # same test as bam file (copy paste)