From bfd61f8e3c0927173787624a7d8bffcca6a74fa0 Mon Sep 17 00:00:00 2001 From: WagnerNils Date: Thu, 9 Feb 2023 15:26:25 +0100 Subject: [PATCH] include outliers in absplice_rna --- absplice/cat_dataloader.py | 1 + absplice/result.py | 46 +++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/absplice/cat_dataloader.py b/absplice/cat_dataloader.py index ddb0ffd..8de91e4 100644 --- a/absplice/cat_dataloader.py +++ b/absplice/cat_dataloader.py @@ -2,6 +2,7 @@ from absplice.dataloader import SpliceMapMixin from absplice.utils import delta_logit_PSI_to_delta_PSI, logit import pandas as pd +pd.options.mode.chained_assignment = None import re from typing import List from splicemap.splice_map import SpliceMap diff --git a/absplice/result.py b/absplice/result.py index d433bfa..546d22c 100644 --- a/absplice/result.py +++ b/absplice/result.py @@ -68,6 +68,7 @@ def __init__(self, df_spliceai=None, df_cadd_splice=None, df_mmsplice_cat=None, + df_outliers_cat=None, gene_map=None, gene_tpm=None, df_var_samples=None, @@ -79,6 +80,7 @@ def __init__(self, self.df_var_samples = self.validate_df_var_samples(df_var_samples) self.df_mmsplice = self.validate_df_mmsplice(df_mmsplice) self.df_mmsplice_cat = self.validate_df_mmsplice_cat(df_mmsplice_cat) + self.df_outliers_cat = self.validate_df_outliers_cat(df_outliers_cat) self.gene_map = self.validate_df_gene_map(gene_map) self.gene_tpm = self.validate_df_gene_tpm(gene_tpm) self.df_spliceai = self.validate_df_spliceai(df_spliceai) @@ -101,6 +103,7 @@ def __init__(self, self._gene_absplice_rna = None self._variant_mmsplice = None self._variant_mmsplice_cat = None + self._variant_outliers_cat = None self._variant_spliceai = None self._variant_absplice_dna = None self._variant_absplice_rna = None @@ -143,6 +146,18 @@ def validate_df_mmsplice_cat(self, df_mmsplice_cat): ~df_mmsplice_cat['delta_psi_cat'].isna() ] return df_mmsplice_cat + + def validate_df_outliers_cat(self, df_outliers_cat): + if 
df_outliers_cat is not None: + df_outliers_cat = self._validate_df( + df_outliers_cat, + columns=['variant', 'gene_id', 'tissue', 'sample', + 'pValueGene_g_minus_log10']) + df_outliers_cat = self._validate_dtype(df_outliers_cat) + df_outliers_cat = df_outliers_cat[ + ~df_outliers_cat['pValueGene_g_minus_log10'].isna() + ] + return df_outliers_cat def validate_df_spliceai(self, df_spliceai): if df_spliceai is not None: @@ -239,7 +254,7 @@ def validate_absplice_rna_input(self, df_absplice_rna_input): 'junction', 'delta_score', 'delta_logit_psi', 'delta_psi', 'ref_psi', 'median_n', 'tissue_cat', 'k_cat', 'n_cat', 'median_n_cat', 'psi_cat', 'ref_psi_cat', - 'delta_logit_psi_cat', 'delta_psi_cat' + 'delta_logit_psi_cat', 'delta_psi_cat', 'pValueGene_g_minus_log10' ]) df_absplice_rna_input = self._validate_dtype(df_absplice_rna_input) groupby = ['variant', 'gene_id', 'tissue', 'sample'] @@ -316,8 +331,9 @@ def _add_tissue_info_to_cadd_splice(self): return self._df_cadd_splice_tissue def _add_samples(self, df): - assert 'chr' not in df.iloc[0]['variant'] - assert 'chr' not in self.df_var_samples.iloc[0]['variant'] + chr_df = 'chr' in df.iloc[0]['variant'] + chr_samples = 'chr' in self.df_var_samples.iloc[0]['variant'] + assert chr_df == chr_samples df = df.set_index('variant') \ .join(self.df_var_samples.set_index('variant'), how='inner') \ @@ -471,6 +487,14 @@ def variant_mmsplice_cat(self): self._variant_mmsplice_cat = self._get_maximum_effect( self.df_mmsplice_cat, groupby, score='delta_psi_cat') return self._variant_mmsplice_cat + + @property + def variant_outliers_cat(self): + groupby = ['variant', 'gene_id', 'tissue', 'sample'] + if self._variant_outliers_cat is None: + self._variant_outliers_cat = self._get_maximum_effect( + self.df_outliers_cat, groupby, score='pValueGene_g_minus_log10') + return self._variant_outliers_cat @property def variant_spliceai(self): # NOTE: max aggregate for variant on each 
gene @@ -491,9 +515,11 @@ def absplice_dna_input(self): # MMSplice (SpliceMap) cols_mmsplice = [ - 'Chromosome', 'Start', 'End', 'Strand', 'junction', 'event_type', 'splice_site', 'gene_name', - 'delta_logit_psi', 'delta_psi', 'ref_psi', 'k', 'n', 'median_n', - 'novel_junction', 'weak_site_donor', 'weak_site_acceptor'] + 'junction', 'event_type', + 'splice_site', 'ref_psi', 'median_n', + 'gene_name', + 'delta_logit_psi', 'delta_psi', + ] if self.df_mmsplice is not None: df_mmsplice = self._get_maximum_effect( self.df_mmsplice, groupby, score='delta_psi') @@ -532,7 +558,7 @@ def absplice_dna_input(self): return self._absplice_dna_input @property - def absplice_rna_input(self): + def absplice_rna_input(self): #TODO: check if tissue_cat should be included in groupby if self._absplice_rna_input is None: groupby = ['variant', 'gene_id', 'tissue', 'sample'] if not pd.Series(groupby).isin(self.absplice_dna_input.index.names).all(): @@ -542,8 +568,12 @@ def absplice_rna_input(self): cols_mmsplice_cat = [ 'junction', 'delta_psi', 'ref_psi', 'median_n', *[col for col in df_mmsplice_cat.columns if 'cat' in col]] + df_outliers_cat = self._get_maximum_effect( + self.df_outliers_cat, groupby, score='pValueGene_g_minus_log10') + cols_outliers_cat = ['pValueGene_g_minus_log10'] self._absplice_rna_input = self.absplice_dna_input.join( - df_mmsplice_cat[cols_mmsplice_cat], how='outer', rsuffix='_from_cat_infer') + df_mmsplice_cat[cols_mmsplice_cat], how='outer', rsuffix='_from_cat_infer').join( + df_outliers_cat[cols_outliers_cat], how='outer', rsuffix='_outlier_cat') return self._absplice_rna_input def _predict_absplice(self, df, absplice_score, pickle_file, features, abs_features, median_n_cutoff, tpm_cutoff):