Skip to content

Commit

Permalink
Bugfixes (#120)
Browse files Browse the repository at this point in the history
* Fix Issue #114 (#116)

* refactored drop demo code s.t. config and sample annotation files are in the package

* adapted pipeline to handle non-existing external counts
new group in demo to distinguish count import from no count import cases
some bugfixes regarding exportCounts parsing and import counts subsetting

* fixed AE pipeline test

* travis with latest R version 4

* revert pipeline in pytest to 2 cores

* R 4.0.2

Co-authored-by: mumichae <mi.mueller@tum.de>

* Refactoring (#118)

* refactored Submodule classes into separate files

* using MAE and AS functions from respective classes instead of Snakefile or R script header

* fixed input for MAE rules

* use chr with bam counts for ASEReadCounter

* added test for aberrant splicing count file functions

* FRASER compatibility fixes

* fixed subsetGroup API change

Co-authored-by: Michaela Müller <mi.mueller@tum.de>
Co-authored-by: Christian Mertes <mertes@in.tum.de>

* using wbuild 1.8.0 for pip installation

* fixed travis install error

Co-authored-by: Michaela Müller <mi.mueller@tum.de>
Co-authored-by: Vicente <yepez@in.tum.de>
Co-authored-by: Christian Mertes <mertes@in.tum.de>
  • Loading branch information
4 people authored Oct 27, 2020
1 parent 5ce58f4 commit 6e9b586
Show file tree
Hide file tree
Showing 34 changed files with 621 additions and 364 deletions.
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@ install:
- conda create -q -n drop -c conda-forge -c bioconda python=$TRAVIS_PYTHON_VERSION r-base=4.0.2
- conda activate drop
- conda install -c conda-forge -c bioconda drop
- conda remove --force drop wbuild
- conda install -c conda-forge -c bioconda wbuild=1.7.1
- pip install -r requirements_test.txt
- conda remove --force drop
- pip install -r tests/requirements.txt

# install drop
- pip install . -vv
Expand Down
1 change: 1 addition & 0 deletions drop/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .setupDrop import *
from . import config
from . import utils
from . import demo
26 changes: 5 additions & 21 deletions drop/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,26 +101,10 @@ def demo():
response = subprocess.run(["bash", download_script], stderr=subprocess.STDOUT)
response.check_returncode()

# fix config file
with open("config.yaml", "r") as f:
config = yaml.load(f, Loader=yaml.Loader)
path_keys = {"root": None,
"htmlOutputPath": None,
"sampleAnnotation": None,
"v29": ["geneAnnotation"],
"genome": ["mae"], "qcVcf": ["mae"]}

for key, sub in path_keys.items():
# iterate to key and entry
dict_ = config
if sub is not None:
for x in sub:
dict_ = dict_[x]
# set absolute path
dict_[key] = str(Path(dict_[key]).resolve())

with open("config.yaml", "w") as f:
yaml.safe_dump(config.copy(), f, default_flow_style=False,
sort_keys=False)
# copy sample annotation and config files with absolute paths
demo_repo = Path(drop.__file__).parent / "demo"
drop.demo.fixSampleAnnotation(demo_repo / "sample_annotation_relative.tsv",
Path.cwd() / "Data" / "sample_annotation.tsv")
drop.demo.fixConfig(demo_repo / "config_relative.yaml", Path.cwd() / "config.yaml")

logger.info("demo project created")
58 changes: 45 additions & 13 deletions drop/config/DropConfig.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .SampleAnnotation import SampleAnnotation
from .Submodules import AE, AS, MAE
from .submodules import *
from .ExportCounts import ExportCounts
from drop import utils
from pathlib import Path
Expand Down Expand Up @@ -35,23 +35,42 @@ def __init__(self, wbuildConfig):
self.htmlOutputPath = Path(self.get("htmlOutputPath"))
self.readmePath = Path(self.get("readmePath"))

# annotations
self.geneAnnotation = self.get("geneAnnotation")
self.genomeAssembly = self.get("genomeAssembly")
self.fastaFile = self.get("mae")["genome"] # TODO: move fasta outside of mae
self.fastaDict = Path(self.fastaFile).with_suffix(".dict")
self.sampleAnnotation = SampleAnnotation(self.get("sampleAnnotation"), self.root)

# setup submodules
cfg = self.config_dict
sa = self.sampleAnnotation
pd = self.processedDataDir
pr = self.processedResultsDir
self.AE = AE(cfg["aberrantExpression"], sa, pd, pr)
self.AS = AS(cfg["aberrantSplicing"], sa, pd, pr)
self.MAE = MAE(cfg["mae"], sa, pd, pr)
# submodules
self.AE = AE(
config=self.get("aberrantExpression"),
sampleAnnotation=self.sampleAnnotation,
processedDataDir=self.processedDataDir,
processedResultsDir=self.processedResultsDir
)
self.AS = AS(
config=self.get("aberrantSplicing"),
sampleAnnotation=self.sampleAnnotation,
processedDataDir=self.processedDataDir,
processedResultsDir=self.processedResultsDir
)
self.MAE = MAE(
config=self.get("mae"),
sampleAnnotation=self.sampleAnnotation,
processedDataDir=self.processedDataDir,
processedResultsDir=self.processedResultsDir
)

# counts export
self.exportCounts = ExportCounts(
self.config_dict, self.processedResultsDir,
self.sampleAnnotation, self.getGeneAnnotations(), self.get("genomeAssembly"),
aberrantExpression=self.AE, aberrantSplicing=self.AS
dict_=self.get("exportCounts"),
outputRoot=self.processedResultsDir,
sampleAnnotation=self.sampleAnnotation,
geneAnnotations=self.getGeneAnnotations(),
genomeAssembly=self.get("genomeAssembly"),
aberrantExpression=self.AE,
aberrantSplicing=self.AS
)

# legacy
Expand Down Expand Up @@ -110,9 +129,16 @@ def getHtmlFromScript(self, path, str_=True):

def get(self, key):
if key not in self.CONFIG_KEYS:
raise KeyError(f"{key} not defined for Drop config")
raise KeyError(f"'{key}' not defined for DROP config")
return self.wBuildConfig.get(key)

def getTool(self, tool):
try:
toolCmd = self.get("tools")[tool]
except KeyError:
raise KeyError(f"'{toolCmd}' not a defined tool for DROP config")
return toolCmd

def getGeneAnnotations(self):
return self.geneAnnotation

Expand All @@ -121,3 +147,9 @@ def getGeneVersions(self):

def getGeneAnnotationFile(self, annotation):
return self.geneAnnotation[annotation]

def getFastaFile(self, str_=True):
return utils.returnPath(self.fastaFile, str_)

def getFastaDict(self, str_=True):
return utils.returnPath(self.fastaDict, str_)
68 changes: 35 additions & 33 deletions drop/config/SampleAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
from pathlib import Path
from snakemake.logging import logger
import warnings

warnings.filterwarnings("ignore", 'This pattern has match groups')


class SampleAnnotation:

SAMPLE_ANNOTATION_COLUMNS = [
"RNA_ID", "RNA_BAM_FILE", "DNA_ID", "DNA_VCF_FILE", "DROP_GROUP", "GENE_COUNTS_FILE", "ANNOTATION",
FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE"]
SAMPLE_ANNOTATION_COLUMNS = FILE_TYPES + [
"RNA_ID", "DNA_ID", "DROP_GROUP", "ANNOTATION",
"PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND"
]

def __init__(self, file, root):
"""
sa_file: sample annotation file location from config
Expand All @@ -22,12 +24,12 @@ def __init__(self, file, root):
self.sa = self.parse()
self.idMapping = self.createIdMapping()
self.sampleFileMapping = self.createSampleFileMapping()

self.rnaIDs = self.createGroupIds(file_type="RNA_BAM_FILE", sep=',')
self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',')
# external counts
self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',')

def parse(self, sep='\t'):
"""
read and check sample annotation for missing columns
Expand All @@ -37,16 +39,16 @@ def parse(self, sep='\t'):
missing_cols = [x for x in self.SAMPLE_ANNOTATION_COLUMNS if x not in sa.columns.values]
if len(missing_cols) > 0:
raise ValueError(f"Incorrect columns in sample annotation file. Missing:\n{missing_cols}")

# remove unwanted characters
col = sa["DROP_GROUP"]
col = col.str.replace(" ", "").str.replace("(|)", "", regex=True)
sa["DROP_GROUP"] = col

# set ID type as string
for ID_key in ["RNA_ID", "DNA_ID"]:
sa[ID_key] = sa[ID_key].apply(
lambda x: str(x) if not pd.isnull(x) else x
lambda x: str(x) if not pd.isnull(x) else x
)
return sa

Expand All @@ -57,7 +59,7 @@ def createIdMapping(self):
Get mapping of RNA and DNA IDs
"""
return self.sa[["RNA_ID", "DNA_ID"]].drop_duplicates().dropna()

def createSampleFileMapping(self):
"""
create a sample file mapping with unique entries of existing files
Expand All @@ -74,11 +76,11 @@ def createSampleFileMapping(self):
df['FILE_TYPE'] = file_type
assay_subsets.append(df)
file_mapping = pd.concat(assay_subsets)

# cleaning SAMPLE_FILE_MAPPING
file_mapping.dropna(inplace=True)
file_mapping.drop_duplicates(inplace=True)

# check for missing files
existing = utils.checkFileExists(file_mapping["FILE_PATH"])
if len(existing) == 0:
Expand All @@ -102,7 +104,13 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
if not file_type:
file_type = "RNA_BAM_FILE"
# infer ID type from file type
assay_id = self.subsetFileMapping(file_type)["ASSAY"].iloc[0]
assay_id = self.subsetFileMapping(file_type)["ASSAY"].unique()
if len(assay_id) == 0:
return {} # no files, return empty mapping
elif len(assay_id) > 1:
raise ValueError(f"More than 1 assay entry for file type {file_type}:\n{assay_id}")
else:
assay_id = assay_id[0]

# Subset sample annotation to only IDs of specified file_type
ids = self.getSampleIDs(file_type)
Expand Down Expand Up @@ -139,7 +147,7 @@ def subsetSampleAnnotation(self, column, values, subset=None):
# check type for subset
if not isinstance(subset, pd.DataFrame):
raise TypeError(f"Is not pandas DataFrame\n {subset}")
if not sa_cols <= set(subset.columns): # check if mandatory cols not contained
if not sa_cols <= set(subset.columns): # check if mandatory cols not contained
raise ValueError(f"Subset columns not the same as {sa_cols}\ngot: {subset.columns}")

# check if column is valid
Expand All @@ -158,12 +166,12 @@ def subsetFileMapping(self, file_type=None, sample_id=None):
subset = utils.subsetBy(subset, "ID", sample_id)
return subset

def subsetGroups(self, subset_groups, assay="RNA", warn=30, error=10):
def subsetGroups(self, subset_groups, assay="RNA"):
"""
Subset DROP group to sample IDs mapping by list of groups (`subset_groups`).
Give warning or error if subsetting results in too few sample IDs per group.
warn : number of samples threshold at which to warn about too few samples
error: number of samples threshold at which to give error
:param subset_groups: list of groups to include
:param assay: name/prefix of assay type
:return: dictionary with group names as keys and ID lists as entries
"""
ids_by_group = self.getGroupedIDs(assay)

Expand All @@ -173,16 +181,6 @@ def subsetGroups(self, subset_groups, assay="RNA", warn=30, error=10):
subset_groups = [subset_groups] if subset_groups.__class__ == str else subset_groups
subset = {gr: ids for gr, ids in
ids_by_group.items() if gr in subset_groups}

for group in subset_groups:
if len(subset[group]) < error:
message = f'Too few IDs in DROP_GROUP {group}'
message += f', please ensure that it has at least {error} IDs'
message += f', groups: {subset[group]}'
raise ValueError(message)
elif len(subset[group]) < warn:
logger.info(f'WARNING: Less than {warn} IDs in DROP_GROUP {group}')

return subset

### Getters
Expand All @@ -199,7 +197,7 @@ def getFilePath(self, sample_id, file_type, single_file=True):
raise ValueError(message)
path = path[0]
return path

def getFilePaths(self, file_type, group=None):
"""
Get all file paths of a file type
Expand Down Expand Up @@ -233,7 +231,7 @@ def getRow(self, column, value):
return row

### DROP Groups ###

def getGroupedIDs(self, assays):
"""
Get group to IDs mapping
Expand All @@ -251,12 +249,16 @@ def getGroupedIDs(self, assays):
else:
raise ValueError(f"'{assay}' is not a valid assay name")
return groupedIDs

def getGroups(self, assay="RNA"):
return self.getGroupedIDs(assay).keys()

def getIDsByGroup(self, group, assay="RNA"):
return self.getGroupedIDs(assay)[group]
try:
ids = self.getGroupedIDs(assay)[group]
except:
ids = []
return ids

def getSampleIDs(self, file_type):
ids = self.subsetFileMapping(file_type)["ID"]
Expand Down
Loading

0 comments on commit 6e9b586

Please sign in to comment.