From 35a501ba90c97a82f5d8ddeaa3fc90be5bc815ce Mon Sep 17 00:00:00 2001 From: Johannes Linder Date: Fri, 19 Apr 2024 11:00:29 -0700 Subject: [PATCH 1/3] Added data processing scripts. --- src/scripts/data/qtl_data/README.md | 33 +++ src/scripts/data/qtl_data/download_finemap.py | 62 +++++ src/scripts/data/qtl_data/download_sumstat.py | 56 +++++ .../qtl_data/ipaqtl_make_negative_sets.py | 196 +++++++++++++++ .../qtl_data/ipaqtl_make_positive_sets.py | 191 ++++++++++++++ src/scripts/data/qtl_data/ipaqtl_vcfs.py | 234 ++++++++++++++++++ .../data/qtl_data/make_expression_tables.py | 181 ++++++++++++++ src/scripts/data/qtl_data/make_vcfs.py | 112 +++++++++ .../data/qtl_data/merge_finemapping_tables.py | 102 ++++++++ .../data/qtl_data/paqtl_make_negative_sets.py | 196 +++++++++++++++ .../data/qtl_data/paqtl_make_positive_sets.py | 191 ++++++++++++++ src/scripts/data/qtl_data/paqtl_vcfs.py | 234 ++++++++++++++++++ .../data/qtl_data/sqtl_make_negative_sets.py | 195 +++++++++++++++ .../data/qtl_data/sqtl_make_positive_sets.py | 190 ++++++++++++++ src/scripts/data/qtl_data/sqtl_vcfs.py | 234 ++++++++++++++++++ src/scripts/data/training_data/Makefile | 47 ++++ src/scripts/data/training_data/README.md | 11 + 17 files changed, 2465 insertions(+) create mode 100644 src/scripts/data/qtl_data/README.md create mode 100644 src/scripts/data/qtl_data/download_finemap.py create mode 100644 src/scripts/data/qtl_data/download_sumstat.py create mode 100644 src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py create mode 100644 src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py create mode 100644 src/scripts/data/qtl_data/ipaqtl_vcfs.py create mode 100644 src/scripts/data/qtl_data/make_expression_tables.py create mode 100644 src/scripts/data/qtl_data/make_vcfs.py create mode 100644 src/scripts/data/qtl_data/merge_finemapping_tables.py create mode 100644 src/scripts/data/qtl_data/paqtl_make_negative_sets.py create mode 100644 src/scripts/data/qtl_data/paqtl_make_positive_sets.py create mode 100644 src/scripts/data/qtl_data/paqtl_vcfs.py create mode 100644 src/scripts/data/qtl_data/sqtl_make_negative_sets.py create mode 100644 src/scripts/data/qtl_data/sqtl_make_positive_sets.py create mode 100644 src/scripts/data/qtl_data/sqtl_vcfs.py create mode 100644 src/scripts/data/training_data/Makefile create mode 100644 src/scripts/data/training_data/README.md diff --git a/src/scripts/data/qtl_data/README.md b/src/scripts/data/qtl_data/README.md new file mode 100644 index 0000000..3a3589e --- /dev/null +++ b/src/scripts/data/qtl_data/README.md @@ -0,0 +1,33 @@ +## QTL data processing + +The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.
+
+*Notes*: +- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
+
+ +As a prerequisite to generating any of the QTL datasets, run the following scripts (in order): +1. download_finemap.py +2. download_sumstat.py +3. merge_finemapping_tables.py +4. make_expression_tables.py +
+ +To prepare the sQTL dataset, run these scripts: +1. sqtl_make_positive_sets.py +2. sqtl_make_negative_sets.py +
+ +To prepare the paQTL dataset, run these scripts: +1. paqtl_make_positive_sets.py +2. paqtl_make_negative_sets.py +
+ +To prepare the ipaQTL dataset, run these scripts: +1. ipaqtl_make_positive_sets.py +2. ipaqtl_make_negative_sets.py +
+ +Finally, to generate the QTL VCF files, run this script: +1. make_vcfs.py +
diff --git a/src/scripts/data/qtl_data/download_finemap.py b/src/scripts/data/qtl_data/download_finemap.py new file mode 100644 index 0000000..558e3ef --- /dev/null +++ b/src/scripts/data/qtl_data/download_finemap.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import pandas as pd + +import util + +''' +download_finemap.py + +Download QTL Catalogue fine-mapping results. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + # read remote table + samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t') + + # filter GTEx (for now) + samples_df = samples_df[samples_df.study == 'GTEx'] + + + ################################################ + # txrevise for splicing / polyA / TSS QTLs + + os.makedirs('txrev', exist_ok=True) + txrev_df = samples_df[samples_df.quant_method == 'txrev'] + + jobs = [] + for all_ftp_path in txrev_df.ftp_path: + # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz + # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets//Alasoo_2018_txrev_macrophage_IFNg+Salmonella.purity_filtered.txt.gz + + all_ftp_file = all_ftp_path.split('/')[-1] + fine_ftp_file = all_ftp_file.replace('all.tsv', 'purity_filtered.txt') + + fine_ftp_path = 'ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets/' + fine_ftp_path += fine_ftp_file + + local_path = 'txrev/%s' % fine_ftp_file + if not os.path.isfile(local_path): + cmd = 'curl -o %s %s' % (local_path, fine_ftp_path) + jobs.append(cmd) + + util.exec_par(jobs, 4, verbose=True) + # print('\n'.join(jobs)) + + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/download_sumstat.py b/src/scripts/data/qtl_data/download_sumstat.py new file mode 100644 index 0000000..ca402df --- /dev/null +++ b/src/scripts/data/qtl_data/download_sumstat.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import pandas as pd + +import util + +''' +download_sumstat.py + +Download QTL Catalogue sumstats. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + # read remote table + samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t') + + # filter GTEx (for now) + samples_df = samples_df[samples_df.study == 'GTEx'] + + + ################################################ + # ge for sumstat (we want SNPs and possibly also base expression) + + os.makedirs('ge', exist_ok=True) + txrev_df = samples_df[samples_df.quant_method == 'ge'] + + jobs = [] + for all_ftp_path in txrev_df.ftp_path: + # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz + + local_path = 'ge/%s' % all_ftp_path.split("/")[-1] + + if not os.path.isfile(local_path): + cmd = 'curl -o %s %s' % (local_path, all_ftp_path) + jobs.append(cmd) + + util.exec_par(jobs, 4, verbose=True) + # print('\n'.join(jobs)) + + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py b/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py new file mode 100644 index 0000000..3f4d49d --- /dev/null +++ b/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +''' +paqtl_make_negative_sets.py + +Build tables with negative (non-causal) SNPs for paQTLs. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Parameters + pip_cutoff = 0.01 + max_distance = 10000 + gene_pad = 50 + apa_file = 'polyadb_intron.bed' + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf' + finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz' + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Compile negative SNP set for each tissue + for tissue_name in tissue_names : + + print("-- " + str(tissue_name) + " --") + + #Load summary stats and extract unique set of SNPs + vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + + #Only keep SNPs (no indels) + vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True) + + vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str) + vcf_df['start'] = vcf_df['position'].astype(int) + vcf_df['end'] = vcf_df['start'] + 1 + vcf_df['strand'] = "." + + vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']] + vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'}) + + print("len(vcf_df) = " + str(len(vcf_df))) + + #Store intermediate SNPs + #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False) + + #Load polyadenylation site annotation + apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand']) + apa_df['Start'] += 1 + + #Load gene span annotation + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str']) + gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True) + + gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]) + + gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad + gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad + + #Join dataframes against gtf annotation + apa_pr = pr.PyRanges(apa_df) + gtf_pr = pr.PyRanges(gtf_df) + vcf_pr = pr.PyRanges(vcf_df) + + apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same') + vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False) + + apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True) + vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True) + + apa_gtf_df['Start'] -= max_distance + apa_gtf_df['End'] += max_distance + + #Join vcf against polyadenylation annotation + apa_gtf_pr = pr.PyRanges(apa_gtf_df) + vcf_gtf_pr = pr.PyRanges(vcf_gtf_df) + + vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False) + + #Force gene_id of SNP to be same as the gene_id of the polyA site + vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']] + + #PolyA site position + vcf_apa_df['Start_b'] += max_distance + vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'}) + + #Distance to polyA site + vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos']) + + #Choose unique SNPs by shortest distance to polyA site + vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True) + + vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'}) + vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']] + + print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered))) + + #Store intermediate SNPs (filtered) + vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', index=False) + + #Reload filtered SNP file + vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', compression='gzip') + + #Create variant identifier + vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt'] + + #Load merged fine-mapping dataframe + finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']] + + #Join against fine-mapping dataframe + neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left') + neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0. + + #Only keep SNPs with PIP < cutoff + neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True) + + #Store final table of negative SNPs + neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t', index=False) + + print("len(neg_df) = " + str(len(neg_df))) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py b/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py new file mode 100644 index 0000000..f1afb7b --- /dev/null +++ b/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +''' +paqtl_make_positive_sets.py + +Build tables with positive (causal) SNPs for paQTLs. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Parameters + pip_cutoff = 0.01 + max_distance = 10000 + gene_pad = 50 + apa_file = 'polyadb_intron.bed' + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf' + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Compile positive SNP set for each tissue + for tissue_name in tissue_names : + + print("-- " + str(tissue_name) + " --") + + #Load fine-mapping table + vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False) + + #Only keep SNPs (no indels) + vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True) + + #Only keep SNPs associated with polyadenylation events + vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".downstream.")].copy().reset_index(drop=True) + + vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str) + vcf_df['start'] = vcf_df['position'].astype(int) + vcf_df['end'] = vcf_df['start'] + 1 + vcf_df['strand'] = "." + + vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']] + vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'}) + + print("len(vcf_df) = " + str(len(vcf_df))) + + #Load polyadenylation site annotation + apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand']) + apa_df['Start'] += 1 + + #Load gene span annotation + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str']) + gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True) + + gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]) + + gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad + gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad + + #Join dataframes against gtf annotation + apa_pr = pr.PyRanges(apa_df) + gtf_pr = pr.PyRanges(gtf_df) + vcf_pr = pr.PyRanges(vcf_df) + + apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same') + vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False) + + apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True) + vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True) + + apa_gtf_df['Start'] -= max_distance + apa_gtf_df['End'] += max_distance + + #Join vcf against polyadenylation annotation + apa_gtf_pr = pr.PyRanges(apa_gtf_df) + vcf_gtf_pr = pr.PyRanges(vcf_gtf_df) + + vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False) + + #Force gene_id of SNP to be same as the gene_id of the polyA site + vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']] + + #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait + vcf_apa_df['molecular_trait_gene_id'] = vcf_apa_df['molecular_trait_id'].apply(lambda x: x.split(".")[0]) + vcf_apa_df = vcf_apa_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True) + + #PolyA site position + vcf_apa_df['Start_b'] += max_distance + vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'}) + + #Distance to polyA site + vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos']) + + #Choose unique SNPs by shortest distance to polyA site (and inverse PIP for tie-breaking) + vcf_apa_df['pip_inv'] = 1. - vcf_apa_df['pip'] + + vcf_apa_df = vcf_apa_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True) + + vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'}) + vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']] + + print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered))) + + #Store intermediate SNPs (filtered) + vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_finemapped_filtered.bed.gz", sep='\t', index=False) + + #Reload filtered SNP file + vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_finemapped_filtered.bed.gz", sep='\t', compression='gzip') + + #Only keep SNPs with PIP > cutoff + pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True) + + #Store final table of positive SNPs + pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_positives.bed.gz", sep='\t', index=False) + + print("len(pos_df) = " + str(len(pos_df))) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/ipaqtl_vcfs.py b/src/scripts/data/qtl_data/ipaqtl_vcfs.py new file mode 100644 index 0000000..773c45e --- /dev/null +++ b/src/scripts/data/qtl_data/ipaqtl_vcfs.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +from optparse import OptionParser +import os +import pdb +import time + +import numpy as np +import pandas as pd +import pyranges as pr +from tqdm import tqdm + +''' +ipaqtl_vcfs.py + +Generate positive and negative intronic paQTL sets from the QTL catalog txrevise. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options]' + parser = OptionParser(usage) + parser.add_option('--neg_pip', dest='neg_pip', + default=0.01, type='float', + help='PIP upper limit for negative examples. [Default: %default]') + parser.add_option('--pos_pip', dest='pos_pip', + default=0.9, type='float', + help='PIP lower limit for positive examples. [Default: %default]') + parser.add_option('--match_gene', dest='match_gene', + default=0, type='int', + help='Try finding negative in same gene as positive. [Default: %default]') + parser.add_option('--match_allele', dest='match_allele', + default=0, type='int', + help='Try finding negative with same ref and alt alleles. [Default: %default]') + parser.add_option('-o', dest='out_prefix', + default='qtlcat_ipaqtl') + (options,args) = parser.parse_args() + + tissue_name = options.out_prefix.split('txrev_')[1] + + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf' + + # read variant table + qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t') + qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_positives.bed.gz", sep='\t') + + # read TPM bin table and construct lookup dictionaries + tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']] + gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index') + + # filter on SNPs with genes in TPM bin dict + qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True) + qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True) + + #Load gene span annotation (protein-coding/categorized only) + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str']) + gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist() + + # filter on SNPs with genes in GTF file + qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True) + qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True) + + bin_to_genes_dict = {} + for _, row in tpm_df.iterrows() : + + if row['bin_index'] not in bin_to_genes_dict : + bin_to_genes_dict[row['bin_index']] = [] + + bin_to_genes_dict[row['bin_index']].append(row['gene_id']) + + for sample_bin in bin_to_genes_dict : + bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin]) + + # split molecular trait id and filter for polyadenylation (for positives) + qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id] + qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id] + + qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'downstream'] + qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'pas_dist'}) + + qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.downstream.negative" + qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id'] + qtlcat_df_neg['event'] = 'downstream' + qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'pas_dist'}) + + paqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True) + + # determine positive variants + paqtl_pos_df = paqtl_df[paqtl_df.pip >= options.pos_pip] + paqtl_neg_df = paqtl_df[paqtl_df.pip < options.neg_pip] + pos_variants = set(paqtl_pos_df.variant) + + neg_gene_and_allele_variants = 0 + neg_gene_variants = 0 + + neg_expr_and_allele_variants = 0 + neg_expr_variants = 0 + + unmatched_variants = 0 + + # choose negative variants + neg_variants = set() + neg_dict = {} + for pvariant in tqdm(pos_variants): + paqtl_this_df = paqtl_pos_df[paqtl_pos_df.variant == pvariant] + + neg_found = False + + # optionally prefer negative from positive's gene set + if options.match_gene == 1 and options.match_allele == 1 : + pgenes = set(paqtl_this_df.gene) + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if neg_found : + neg_gene_and_allele_variants += 1 + + if not neg_found and options.match_gene == 1 : + pgenes = set(paqtl_this_df.gene) + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if neg_found : + neg_gene_variants += 1 + + if not neg_found and options.match_allele == 1 : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if neg_found : + neg_expr_and_allele_variants += 1 + + if not neg_found : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if neg_found : + neg_expr_variants += 1 + + if not neg_found : + print("[Warning] Could not find a matching negative for '" + pvariant + "'") + unmatched_variants += 1 + + print('%d positive variants' % len(pos_variants)) + print('%d negative variants' % len(neg_variants)) + print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants) + print(' - %d gene-matched negatives ' % neg_gene_variants) + print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants) + print(' - %d expr-matched negatives ' % neg_expr_variants) + print(' - %d unmatched negatives ' % unmatched_variants) + + pos_dict = {pv: pv for pv in pos_variants} + + # write VCFs + write_vcf('%s_pos.vcf' % options.out_prefix, paqtl_df, pos_variants, pos_dict) + write_vcf('%s_neg.vcf' % options.out_prefix, paqtl_df, neg_variants, neg_dict) + +def find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, match_allele) : + + gene_mask = np.array([gene in pgenes for gene in paqtl_neg_df.gene]) + paqtl_neg_gene_df = paqtl_neg_df[gene_mask] + + # match PAS distance + this_dist = paqtl_this_df.iloc[0].pas_dist + dist_cmp = np.abs(paqtl_neg_gene_df.pas_dist - this_dist) + dist_cmp_unique = np.sort(np.unique(dist_cmp.values)) + + this_ref = paqtl_this_df.iloc[0].ref + this_alt = paqtl_this_df.iloc[0].alt + + for ni_unique in dist_cmp_unique: + + paqtl_neg_gene_dist_df = paqtl_neg_gene_df.loc[dist_cmp == ni_unique] + + shuffle_index = np.arange(len(paqtl_neg_gene_dist_df), dtype='int32') + np.random.shuffle(shuffle_index) + + for npaqtl_i in range(len(paqtl_neg_gene_dist_df)) : + npaqtl = paqtl_neg_gene_dist_df.iloc[shuffle_index[npaqtl_i]] + + if not match_allele or (npaqtl.ref == this_ref and npaqtl.alt == this_alt): + if npaqtl.variant not in neg_variants and npaqtl.variant not in pos_variants: + + neg_variants.add(npaqtl.variant) + neg_dict[npaqtl.variant] = paqtl_this_df.iloc[0].variant + + return True + + return False + +def write_vcf(vcf_file, df, variants_write, variants_dict): + vcf_open = open(vcf_file, 'w') + print('##fileformat=VCFv4.2', file=vcf_open) + print('##INFO=', + file=vcf_open) + print('##INFO=', + file=vcf_open) + print('##INFO=', + file=vcf_open) + cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'] + print('\t'.join(cols), file=vcf_open) + + variants_written = set() + + for v in df.itertuples(): + if v.variant in variants_write and v.variant not in variants_written: + cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.'] + cols += ['MT=%s;PD=%d;PI=%s' % (v.molecular_trait_id, v.pas_dist, variants_dict[v.variant])] + print('\t'.join(cols), file=vcf_open) + variants_written.add(v.variant) + + vcf_open.close() + + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/make_expression_tables.py b/src/scripts/data/qtl_data/make_expression_tables.py new file mode 100644 index 0000000..ddc2a63 --- /dev/null +++ b/src/scripts/data/qtl_data/make_expression_tables.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +import matplotlib.pyplot as plt + +''' +make_expression_tables.py + +Contruct TPM bucket to sample genes from. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Define tissue column-to-file mapping + tissue_dict = { + 'Adipose - Subcutaneous' : 'adipose_subcutaneous', + 'Adipose - Visceral (Omentum)' : 'adipose_visceral', + 'Adrenal Gland' : 'adrenal_gland', + 'Artery - Aorta' : 'artery_aorta', + 'Artery - Coronary' : 'artery_coronary', + 'Artery - Tibial' : 'artery_tibial', + 'Whole Blood' : 'blood', + 'Brain - Amygdala' : 'brain_amygdala', + 'Brain - Anterior cingulate cortex (BA24)' : 'brain_anterior_cingulate_cortex', + 'Brain - Caudate (basal ganglia)' : 'brain_caudate', + 'Brain - Cerebellar Hemisphere' : 'brain_cerebellar_hemisphere', + 'Brain - Cerebellum' : 'brain_cerebellum', + 'Brain - Cortex' : 'brain_cortex', + 'Brain - Frontal Cortex (BA9)' : 'brain_frontal_cortex', + 'Brain - Hippocampus' : 'brain_hippocampus', + 'Brain - Hypothalamus' : 'brain_hypothalamus', + 'Brain - Nucleus accumbens (basal ganglia)' : 'brain_nucleus_accumbens', + 'Brain - Putamen (basal ganglia)' : 'brain_putamen', + 'Brain - Spinal cord (cervical c-1)' : 'brain_spinal_cord', + 'Brain - Substantia nigra' : 'brain_substantia_nigra', + 'Breast - Mammary Tissue' : 'breast', + 'Colon - Sigmoid' : 'colon_sigmoid', + 'Colon - Transverse' : 'colon_transverse', + 'Esophagus - Gastroesophageal Junction' : 'esophagus_gej', + 'Esophagus - Mucosa' : 'esophagus_mucosa', + 'Esophagus - Muscularis' : 'esophagus_muscularis', + 'Cells - Cultured fibroblasts' : 'fibroblast', + 'Heart - Atrial Appendage' : 'heart_atrial_appendage', + 'Heart - Left Ventricle' : 'heart_left_ventricle', + 'Kidney - Cortex' : 'kidney_cortex', + 'Cells - EBV-transformed lymphocytes' : 'LCL', + 'Liver' : 'liver', + 'Lung' : 'lung', + 'Minor Salivary Gland' : 'minor_salivary_gland', + 'Muscle - Skeletal' : 'muscle', + 'Nerve - Tibial' : 'nerve_tibial', + 'Ovary' : 'ovary', + 'Pancreas' : 'pancreas', + 'Pituitary' : 'pituitary', + 'Prostate' : 'prostate', + 'Skin - Not Sun Exposed (Suprapubic)' : 'skin_not_sun_exposed', + 'Skin - Sun Exposed (Lower leg)' : 'skin_sun_exposed', + 'Small Intestine - Terminal Ileum' : 'small_intestine', + 'Spleen' : 'spleen', + 'Stomach' : 'stomach', + 'Testis' : 'testis', + 'Thyroid' : 'thyroid', + 'Uterus' : 'uterus', + 'Vagina' : 'vagina', + } + + for tissue_name in tissue_dict : + + #Load TPM matrix + tpm_df = pd.read_csv("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", sep='\t', compression='gzip', skiprows=2) + + save_name = tissue_dict[tissue_name] + + print("-- " + save_name + " --") + + #Clean dataframe + tpm_df['gene_id'] = tpm_df['Name'].apply(lambda x: x.split(".")[0]) + + tpm_df = tpm_df.drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + tpm_df['tpm'] = tpm_df[tissue_name] + tpm_df = tpm_df[['gene_id', 'tpm']] + + #Get non-zero TPM entries + tpm_df_zero = tpm_df.loc[tpm_df['tpm'] == 0].copy().reset_index(drop=True) + tpm_df_nonzero = tpm_df.loc[tpm_df['tpm'] > 0].copy().reset_index(drop=True) + + tpm_df_zero['tpm_log2'] = 0. + tpm_df_nonzero['tpm_log2'] = np.log2(tpm_df_nonzero['tpm']) + + #Clip at extremes + min_q = 0.0075 + max_q = 0.9925 + + #Log2 fold change bin sizes + bin_size = 0.4 + bin_offset = 0.15 + + min_tpm_log2 = np.quantile(tpm_df_nonzero['tpm_log2'], q=min_q) + max_tpm_log2 = np.quantile(tpm_df_nonzero['tpm_log2'], q=max_q) + + tpm_df_nonzero.loc[tpm_df_nonzero['tpm_log2'] < min_tpm_log2, 'tpm_log2'] = min_tpm_log2 + tpm_df_nonzero.loc[tpm_df_nonzero['tpm_log2'] > max_tpm_log2, 'tpm_log2'] = max_tpm_log2 + + tpm_log2 = tpm_df_nonzero['tpm_log2'].values + + n_bins = int((max_tpm_log2 - min_tpm_log2) / bin_size) + + #Get sample bins + sample_bins = np.linspace(min_tpm_log2, max_tpm_log2, n_bins+1) + + #Map values to bins + bin_index = np.digitize(tpm_log2, sample_bins[1:], right=True) + bin_index_l = np.digitize(tpm_log2 - bin_offset, sample_bins[1:], right=True) + bin_index_r = np.digitize(tpm_log2 + bin_offset, sample_bins[1:], right=True) + + tpm_df_zero['bin_index_l'] = -1 * np.ones(len(tpm_df_zero), dtype='int32') + tpm_df_zero['bin_index'] = -1 * np.ones(len(tpm_df_zero), dtype='int32') + tpm_df_zero['bin_index_r'] = -1 * np.ones(len(tpm_df_zero), dtype='int32') + + tpm_df_nonzero['bin_index_l'] = bin_index_l + tpm_df_nonzero['bin_index'] = bin_index + tpm_df_nonzero['bin_index_r'] = bin_index_r + + tpm_df = pd.concat([tpm_df_zero, tpm_df_nonzero]).copy().reset_index(drop=True) + + tpm_df = tpm_df.sort_values(by='gene_id', ascending=True).copy().reset_index(drop=True) + + #Save dataframe + tpm_df.to_csv('ge/GTEx_ge_' + save_name + "_tpms.csv", sep='\t', index=False) + + #Visualize TPM sample bins + tpm_df_filtered = tpm_df.loc[tpm_df['tpm'] > 0.] + + f = plt.figure(figsize=(4, 3)) + + plt.hist(tpm_df_filtered['bin_index'].values, bins=np.unique(tpm_df_filtered['bin_index'].values)) + + plt.xlim(0, np.max(tpm_df_filtered['bin_index'].values)) + + plt.xticks(fontsize=8) + plt.yticks(fontsize=8) + + plt.xlabel("Sample bin (FC < " + str(round(2**(bin_size+2*bin_offset), 2)) + ")", fontsize=8) + plt.ylabel("# of genes", fontsize=8) + + plt.title("TPM sample bins (" + save_name + ")", fontsize=8) + + plt.tight_layout() + + plt.savefig('ge/GTEx_ge_' + save_name + "_tpms.png", transparent=False, dpi=300) + + plt.close() + + #Check and warn in case of low-support bins + _, bin_support = np.unique(tpm_df_filtered['bin_index'].values, return_counts=True) + + if np.any(bin_support < 100) : + print("[Warning] Less than 100 genes in some of the TPM sample bins (min = " + str(int(np.min(bin_support))) + ").") + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/make_vcfs.py b/src/scripts/data/qtl_data/make_vcfs.py new file mode 100644 index 0000000..aa251d0 --- /dev/null +++ b/src/scripts/data/qtl_data/make_vcfs.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import glob +import os + +import pandas as pd + +import util + +''' +make_vcfs.py + +Download QTL Catalogue fine-mapping results. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + pip = 0.2 + match_gene = 0 + match_allele = 1 + + ################################################ + # intronic polyA QTLs + + out_dir = 'ipaqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '') + os.makedirs(out_dir, exist_ok=True) + + jobs = [] + for table_file in glob.glob('txrev/*.txt.gz'): + out_prefix = table_file.replace('txrev/', '%s/' % out_dir) + out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '') + cmd = './ipaqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix) + jobs.append(cmd) + util.exec_par(jobs, 6, verbose=True) + + # merge study/tissue variants + mpos_vcf_file = '%s/pos_merge.vcf' % out_dir + mneg_vcf_file = '%s/neg_merge.vcf' % out_dir + merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir) + merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir) + + + ################################################ + # polyA QTLs + + out_dir = 'paqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '') + os.makedirs(out_dir, exist_ok=True) + + jobs = [] + for table_file in glob.glob('txrev/*.txt.gz'): + out_prefix = table_file.replace('txrev/', '%s/' % out_dir) + out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '') + cmd = './paqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix) + jobs.append(cmd) + util.exec_par(jobs, 6, verbose=True) + + # merge study/tissue variants + mpos_vcf_file = '%s/pos_merge.vcf' % out_dir + mneg_vcf_file = '%s/neg_merge.vcf' % out_dir + merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir) + merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir) + + ################################################ + # splicing QTLs + + out_dir = 'sqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '') + os.makedirs(out_dir, exist_ok=True) + + jobs = [] + for table_file in glob.glob('txrev/*.txt.gz'): + out_prefix = table_file.replace('txrev/', '%s/' % out_dir) + out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '') + cmd = './sqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix) + jobs.append(cmd) + util.exec_par(jobs, 6, verbose=True) + + # merge study/tissue variants + mpos_vcf_file = '%s/pos_merge.vcf' % out_dir + mneg_vcf_file = '%s/neg_merge.vcf' % out_dir + merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir) + merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir) + + +def merge_variants(merge_vcf_file, vcf_glob): + with open(merge_vcf_file, 'w') as merge_vcf_open: + vcf0_file = list(glob.glob(vcf_glob))[0] + for line in open(vcf0_file): + if line[0] == '#': + print(line, end='', file=merge_vcf_open) + + merged_variants = set() + for vcf_file in glob.glob(vcf_glob): + for line in open(vcf_file): + if not line.startswith('#'): + variant = line.split()[2] + if variant not in merged_variants: + print(line, file=merge_vcf_open, end='') + merged_variants.add(variant) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/merge_finemapping_tables.py b/src/scripts/data/qtl_data/merge_finemapping_tables.py new file mode 100644 index 0000000..ac4fa7d --- /dev/null +++ b/src/scripts/data/qtl_data/merge_finemapping_tables.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +''' +merge_finemapping_tables.py + +Merge fine-mapping tables of QTL credible sets. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Load and merge fine-mapping results + dfs = [] + for tissue_name in tissue_names : + + print("-- " + tissue_name + " --") + + df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip'], low_memory=False) + dfs.append(df.sort_values(by='pip', ascending=False).drop_duplicates(subset=['variant'], keep='first').copy().reset_index(drop=True)) + + df = pd.concat(dfs).sort_values(by='pip', ascending=False).drop_duplicates(subset=['variant'], keep='first').copy().reset_index(drop=True) + + df['chromosome'] = "chr" + df['chromosome'].astype(str) + df = df.rename(columns={'chromosome' : 'chrom', 'position' : 'pos'}) + + print("len(df) = " + str(len(df))) + + #Save union of dataframes + df.to_csv("txrev/GTEx_txrev_finemapped_merged.csv.gz", sep='\t', index=False) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/paqtl_make_negative_sets.py b/src/scripts/data/qtl_data/paqtl_make_negative_sets.py new file mode 100644 index 0000000..a5da60d --- /dev/null +++ b/src/scripts/data/qtl_data/paqtl_make_negative_sets.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +''' +paqtl_make_negative_sets.py + +Build tables with negative (non-causal) SNPs for paQTLs. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Parameters + pip_cutoff = 0.01 + max_distance = 10000 + gene_pad = 50 + apa_file = '/home/drk/common/data/genomes/hg38/genes/polyadb/polyadb_exon3.bed' + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf' + finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz' + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Compile negative SNP set for each tissue + for tissue_name in tissue_names : + + print("-- " + str(tissue_name) + " --") + + #Load summary stats and extract unique set of SNPs + vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + + #Only keep SNPs (no indels) + vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True) + + vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str) + vcf_df['start'] = vcf_df['position'].astype(int) + vcf_df['end'] = vcf_df['start'] + 1 + vcf_df['strand'] = "." + + vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']] + vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'}) + + print("len(vcf_df) = " + str(len(vcf_df))) + + #Store intermediate SNPs + #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False) + + #Load polyadenylation site annotation + apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand']) + apa_df['Start'] += 1 + + #Load gene span annotation + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str']) + gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True) + + gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]) + + gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad + gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad + + #Join dataframes against gtf annotation + apa_pr = pr.PyRanges(apa_df) + gtf_pr = pr.PyRanges(gtf_df) + vcf_pr = pr.PyRanges(vcf_df) + + apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same') + vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False) + + apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True) + vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True) + + apa_gtf_df['Start'] -= max_distance + apa_gtf_df['End'] += max_distance + + #Join vcf against polyadenylation annotation + apa_gtf_pr = pr.PyRanges(apa_gtf_df) + vcf_gtf_pr = pr.PyRanges(vcf_gtf_df) + + vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False) + + #Force gene_id of SNP to be same as the gene_id of the polyA site + vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']] + + #PolyA site position + vcf_apa_df['Start_b'] += max_distance + vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'}) + + #Distance to polyA site + vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos']) + + #Choose unique SNPs by shortest distance to polyA site + vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True) + + vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'}) + vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']] + + print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered))) + + #Store intermediate SNPs (filtered) + vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_polya_filtered.bed.gz", sep='\t', index=False) + + #Reload filtered SNP file + vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_polya_filtered.bed.gz", sep='\t', compression='gzip') + + #Create variant identifier + vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt'] + + #Load merged fine-mapping dataframe + finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']] + + #Join against fine-mapping dataframe + neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left') + neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0. + + #Only keep SNPs with PIP < cutoff + neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True) + + #Store final table of negative SNPs + neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_polya_negatives.bed.gz", sep='\t', index=False) + + print("len(neg_df) = " + str(len(neg_df))) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/paqtl_make_positive_sets.py b/src/scripts/data/qtl_data/paqtl_make_positive_sets.py new file mode 100644 index 0000000..3d07fa3 --- /dev/null +++ b/src/scripts/data/qtl_data/paqtl_make_positive_sets.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +''' +paqtl_make_positive_sets.py + +Build tables with positive (causal) SNPs for paQTLs. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Parameters + pip_cutoff = 0.01 + max_distance = 10000 + gene_pad = 50 + apa_file = '/home/drk/common/data/genomes/hg38/genes/polyadb/polyadb_exon3.bed' + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf' + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Compile positive SNP set for each tissue + for tissue_name in tissue_names : + + print("-- " + str(tissue_name) + " --") + + #Load fine-mapping table + vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False) + + #Only keep SNPs (no indels) + vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True) + + #Only keep SNPs associated with polyadenylation events + vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".downstream.")].copy().reset_index(drop=True) + + vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str) + vcf_df['start'] = vcf_df['position'].astype(int) + vcf_df['end'] = vcf_df['start'] + 1 + vcf_df['strand'] = "." + + vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']] + vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'}) + + print("len(vcf_df) = " + str(len(vcf_df))) + + #Load polyadenylation site annotation + apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand']) + apa_df['Start'] += 1 + + #Load gene span annotation + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str']) + gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True) + + gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]) + + gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad + gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad + + #Join dataframes against gtf annotation + apa_pr = pr.PyRanges(apa_df) + gtf_pr = pr.PyRanges(gtf_df) + vcf_pr = pr.PyRanges(vcf_df) + + apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same') + vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False) + + apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True) + vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True) + + apa_gtf_df['Start'] -= max_distance + apa_gtf_df['End'] += max_distance + + #Join vcf against polyadenylation annotation + apa_gtf_pr = pr.PyRanges(apa_gtf_df) + vcf_gtf_pr = pr.PyRanges(vcf_gtf_df) + + vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False) + + #Force gene_id of SNP to be same as the gene_id of the polyA site + vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']] + + #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait + vcf_apa_df['molecular_trait_gene_id'] = vcf_apa_df['molecular_trait_id'].apply(lambda x: x.split(".")[0]) + vcf_apa_df = vcf_apa_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True) + + #PolyA site position + vcf_apa_df['Start_b'] += max_distance + vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'}) + + #Distance to polyA site + vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos']) + + #Choose unique SNPs by shortest distance to polyA site (and inverse PIP for tie-breaking) + vcf_apa_df['pip_inv'] = 1. - vcf_apa_df['pip'] + + vcf_apa_df = vcf_apa_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True) + + vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'}) + vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']] + + print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered))) + + #Store intermediate SNPs (filtered) + vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_polya_finemapped_filtered.bed.gz", sep='\t', index=False) + + #Reload filtered SNP file + vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_polya_finemapped_filtered.bed.gz", sep='\t', compression='gzip') + + #Only keep SNPs with PIP > cutoff + pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True) + + #Store final table of positive SNPs + pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_polya_positives.bed.gz", sep='\t', index=False) + + print("len(pos_df) = " + str(len(pos_df))) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/paqtl_vcfs.py b/src/scripts/data/qtl_data/paqtl_vcfs.py new file mode 100644 index 0000000..f0884b1 --- /dev/null +++ b/src/scripts/data/qtl_data/paqtl_vcfs.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +from optparse import OptionParser +import os +import pdb +import time + +import numpy as np +import pandas as pd +import pyranges as pr +from tqdm import tqdm + +''' +paqtl_vcfs.py + +Generate positive and negative paQTL sets from the QTL catalog txrevise. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options]' + parser = OptionParser(usage) + parser.add_option('--neg_pip', dest='neg_pip', + default=0.01, type='float', + help='PIP upper limit for negative examples. [Default: %default]') + parser.add_option('--pos_pip', dest='pos_pip', + default=0.9, type='float', + help='PIP lower limit for positive examples. [Default: %default]') + parser.add_option('--match_gene', dest='match_gene', + default=0, type='int', + help='Try finding negative in same gene as positive. [Default: %default]') + parser.add_option('--match_allele', dest='match_allele', + default=0, type='int', + help='Try finding negative with same ref and alt alleles. [Default: %default]') + parser.add_option('-o', dest='out_prefix', + default='qtlcat_paqtl') + (options,args) = parser.parse_args() + + tissue_name = options.out_prefix.split('txrev_')[1] + + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf' + + # read variant table + qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_polya_negatives.bed.gz", sep='\t') + qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_polya_positives.bed.gz", sep='\t') + + # read TPM bin table and construct lookup dictionaries + tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']] + gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index') + + # filter on SNPs with genes in TPM bin dict + qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True) + qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True) + + #Load gene span annotation (protein-coding/categorized only) + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str']) + gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist() + + # filter on SNPs with genes in GTF file + qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True) + qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True) + + bin_to_genes_dict = {} + for _, row in tpm_df.iterrows() : + + if row['bin_index'] not in bin_to_genes_dict : + bin_to_genes_dict[row['bin_index']] = [] + + bin_to_genes_dict[row['bin_index']].append(row['gene_id']) + + for sample_bin in bin_to_genes_dict : + bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin]) + + # split molecular trait id and filter for polyadenylation (for positives) + qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id] + qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id] + + qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'downstream'] + qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'pas_dist'}) + + qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.downstream.negative" + qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id'] + qtlcat_df_neg['event'] = 'downstream' + qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'pas_dist'}) + + paqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True) + + # determine positive variants + paqtl_pos_df = paqtl_df[paqtl_df.pip >= options.pos_pip] + paqtl_neg_df = paqtl_df[paqtl_df.pip < options.neg_pip] + pos_variants = set(paqtl_pos_df.variant) + + neg_gene_and_allele_variants = 0 + neg_gene_variants = 0 + + neg_expr_and_allele_variants = 0 + neg_expr_variants = 0 + + unmatched_variants = 0 + + # choose negative variants + neg_variants = set() + neg_dict = {} + for pvariant in tqdm(pos_variants): + paqtl_this_df = paqtl_pos_df[paqtl_pos_df.variant == pvariant] + + neg_found = False + + # optionally prefer negative from positive's gene set + if options.match_gene == 1 and options.match_allele == 1 : + pgenes = set(paqtl_this_df.gene) + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if neg_found : + neg_gene_and_allele_variants += 1 + + if not neg_found and options.match_gene == 1 : + pgenes = set(paqtl_this_df.gene) + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if neg_found : + neg_gene_variants += 1 + + if not neg_found and options.match_allele == 1 : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True) + + if neg_found : + neg_expr_and_allele_variants += 1 + + if not neg_found : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False) + + if neg_found : + neg_expr_variants += 1 + + if not neg_found : + print("[Warning] Could not find a matching negative for '" + pvariant + "'") + unmatched_variants += 1 + + print('%d positive variants' % len(pos_variants)) + print('%d negative variants' % len(neg_variants)) + print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants) + print(' - %d gene-matched negatives ' % neg_gene_variants) + print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants) + print(' - %d expr-matched negatives ' % neg_expr_variants) + print(' - %d unmatched negatives ' % unmatched_variants) + + pos_dict = {pv: pv for pv in pos_variants} + + # write VCFs + write_vcf('%s_pos.vcf' % options.out_prefix, paqtl_df, pos_variants, pos_dict) + write_vcf('%s_neg.vcf' % options.out_prefix, paqtl_df, neg_variants, neg_dict) + +def find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, match_allele) : + + gene_mask = np.array([gene in pgenes for gene in paqtl_neg_df.gene]) + paqtl_neg_gene_df = paqtl_neg_df[gene_mask] + + # match PAS distance + this_dist = paqtl_this_df.iloc[0].pas_dist + dist_cmp = np.abs(paqtl_neg_gene_df.pas_dist - this_dist) + dist_cmp_unique = np.sort(np.unique(dist_cmp.values)) + + this_ref = paqtl_this_df.iloc[0].ref + this_alt = paqtl_this_df.iloc[0].alt + + for ni_unique in dist_cmp_unique: + + paqtl_neg_gene_dist_df = paqtl_neg_gene_df.loc[dist_cmp == ni_unique] + + shuffle_index = np.arange(len(paqtl_neg_gene_dist_df), dtype='int32') + np.random.shuffle(shuffle_index) + + for npaqtl_i in range(len(paqtl_neg_gene_dist_df)) : + npaqtl = paqtl_neg_gene_dist_df.iloc[shuffle_index[npaqtl_i]] + + if not match_allele or (npaqtl.ref == this_ref and npaqtl.alt == this_alt): + if npaqtl.variant not in neg_variants and npaqtl.variant not in pos_variants: + + neg_variants.add(npaqtl.variant) + neg_dict[npaqtl.variant] = paqtl_this_df.iloc[0].variant + + return True + + return False + +def write_vcf(vcf_file, df, variants_write, variants_dict): + vcf_open = open(vcf_file, 'w') + print('##fileformat=VCFv4.2', file=vcf_open) + print('##INFO=', + file=vcf_open) + print('##INFO=', + file=vcf_open) + print('##INFO=', + file=vcf_open) + cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'] + print('\t'.join(cols), file=vcf_open) + + variants_written = set() + + for v in df.itertuples(): + if v.variant in variants_write and v.variant not in variants_written: + cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.'] + cols += ['MT=%s;PD=%d;PI=%s' % (v.molecular_trait_id, v.pas_dist, variants_dict[v.variant])] + print('\t'.join(cols), file=vcf_open) + variants_written.add(v.variant) + + vcf_open.close() + + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/sqtl_make_negative_sets.py b/src/scripts/data/qtl_data/sqtl_make_negative_sets.py new file mode 100644 index 0000000..7518ca4 --- /dev/null +++ b/src/scripts/data/qtl_data/sqtl_make_negative_sets.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +''' +sqtl_make_negative_sets.py + +Build tables with negative (non-causal) SNPs for sQTLs. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Parameters + pip_cutoff = 0.01 + max_distance = 10000 + gene_pad = 50 + splice_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_protein_splice.gff' + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf' + finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz' + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Compile negative SNP set for each tissue + for tissue_name in tissue_names : + + print("-- " + str(tissue_name) + " --") + + #Load summary stats and extract unique set of SNPs + vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + + #Only keep SNPs (no indels) + vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True) + + vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str) + vcf_df['start'] = vcf_df['position'].astype(int) + vcf_df['end'] = vcf_df['start'] + 1 + vcf_df['strand'] = "." + + vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']] + vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'}) + + print("len(vcf_df) = " + str(len(vcf_df))) + + #Store intermediate SNPs + #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False) + + #Load splice site annotation + splice_df = pd.read_csv(splice_file, sep='\t', names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'], usecols=['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand'])[['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand']] + + #Load gene span annotation + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str']) + gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True) + + gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]) + + gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad + gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad + + #Join dataframes against gtf annotation + splice_pr = pr.PyRanges(splice_df) + gtf_pr = pr.PyRanges(gtf_df) + vcf_pr = pr.PyRanges(vcf_df) + + splice_gtf_pr = splice_pr.join(gtf_pr, strandedness='same') + vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False) + + splice_gtf_df = splice_gtf_pr.df[['Chromosome', 'Start', 'End', 'feature', 'gene_id', 'Strand']].copy().reset_index(drop=True) + vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True) + + splice_gtf_df['Start'] -= max_distance + splice_gtf_df['End'] += max_distance + + #Join vcf against splice annotation + splice_gtf_pr = pr.PyRanges(splice_gtf_df) + vcf_gtf_pr = pr.PyRanges(vcf_gtf_df) + + vcf_splice_pr = vcf_gtf_pr.join(splice_gtf_pr, strandedness=False) + + #Force gene_id of SNP to be same as the gene_id of the splice site + vcf_splice_df = vcf_splice_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True) + vcf_splice_df = vcf_splice_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'feature', 'Strand_b', 'Start_b']] + + #Splice site position + vcf_splice_df['Start_b'] += max_distance + vcf_splice_df = vcf_splice_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'splice_pos', 'Strand_b' : 'Strand'}) + + #Distance to splice site + vcf_splice_df['distance'] = np.abs(vcf_splice_df['Pos'] - vcf_splice_df['splice_pos']) + + #Choose unique SNPs by shortest distance to splice site + vcf_splice_df = vcf_splice_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + vcf_splice_df = vcf_splice_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True) + + vcf_df_filtered = vcf_splice_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'}) + vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'feature', 'strand', 'splice_pos', 'distance']] + + print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered))) + + #Store intermediate SNPs (filtered) + vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_splice_filtered.bed.gz", sep='\t', index=False) + + #Reload filtered SNP file + vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_splice_filtered.bed.gz", sep='\t', compression='gzip') + + #Create variant identifier + vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt'] + + #Load merged fine-mapping dataframe + finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']] + + #Join against fine-mapping dataframe + neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left') + neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0. + + #Only keep SNPs with PIP < cutoff + neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True) + + #Store final table of negative SNPs + neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_splice_negatives.bed.gz", sep='\t', index=False) + + print("len(neg_df) = " + str(len(neg_df))) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/sqtl_make_positive_sets.py b/src/scripts/data/qtl_data/sqtl_make_positive_sets.py new file mode 100644 index 0000000..954ab7e --- /dev/null +++ b/src/scripts/data/qtl_data/sqtl_make_positive_sets.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import os + +import util + +import numpy as np +import pandas as pd + +import pyranges as pr + +''' +sqtl_make_positive_sets.py + +Build tables with positive (causal) SNPs for sQTLs. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] arg' + parser = OptionParser(usage) + #parser.add_option() + (options,args) = parser.parse_args() + + #Parameters + pip_cutoff = 0.01 + max_distance = 10000 + gene_pad = 50 + splice_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_protein_splice.gff' + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf' + + #Define tissues + tissue_names = [ + 'adipose_subcutaneous', + 'adipose_visceral', + 'adrenal_gland', + 'artery_aorta', + 'artery_coronary', + 'artery_tibial', + 'blood', + 'brain_amygdala', + 'brain_anterior_cingulate_cortex', + 'brain_caudate', + 'brain_cerebellar_hemisphere', + 'brain_cerebellum', + 'brain_cortex', + 'brain_frontal_cortex', + 'brain_hippocampus', + 'brain_hypothalamus', + 'brain_nucleus_accumbens', + 'brain_putamen', + 'brain_spinal_cord', + 'brain_substantia_nigra', + 'breast', + 'colon_sigmoid', + 'colon_transverse', + 'esophagus_gej', + 'esophagus_mucosa', + 'esophagus_muscularis', + 'fibroblast', + 'heart_atrial_appendage', + 'heart_left_ventricle', + 'kidney_cortex', + 'LCL', + 'liver', + 'lung', + 'minor_salivary_gland', + 'muscle', + 'nerve_tibial', + 'ovary', + 'pancreas', + 'pituitary', + 'prostate', + 'skin_not_sun_exposed', + 'skin_sun_exposed', + 'small_intestine', + 'spleen', + 'stomach', + 'testis', + 'thyroid', + 'uterus', + 'vagina', + ] + + #Compile positive SNP set for each tissue + for tissue_name in tissue_names : + + print("-- " + str(tissue_name) + " --") + + #Load fine-mapping table + vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False) + + #Only keep SNPs (no indels) + vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True) + + #Only keep SNPs associated with splice events + vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".contained.")].copy().reset_index(drop=True) + + vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str) + vcf_df['start'] = vcf_df['position'].astype(int) + vcf_df['end'] = vcf_df['start'] + 1 + vcf_df['strand'] = "." + + vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']] + vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'}) + + print("len(vcf_df) = " + str(len(vcf_df))) + + #Load splice site annotation + splice_df = pd.read_csv(splice_file, sep='\t', names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'], usecols=['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand'])[['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand']] + + #Load gene span annotation + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str']) + gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True) + + gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]) + + gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True) + + gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad + gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad + + #Join dataframes against gtf annotation + splice_pr = pr.PyRanges(splice_df) + gtf_pr = pr.PyRanges(gtf_df) + vcf_pr = pr.PyRanges(vcf_df) + + splice_gtf_pr = splice_pr.join(gtf_pr, strandedness='same') + vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False) + + splice_gtf_df = splice_gtf_pr.df[['Chromosome', 'Start', 'End', 'feature', 'gene_id', 'Strand']].copy().reset_index(drop=True) + vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True) + + splice_gtf_df['Start'] -= max_distance + splice_gtf_df['End'] += max_distance + + #Join vcf against splice annotation + splice_gtf_pr = pr.PyRanges(splice_gtf_df) + vcf_gtf_pr = pr.PyRanges(vcf_gtf_df) + + vcf_splice_pr = vcf_gtf_pr.join(splice_gtf_pr, strandedness=False) + + #Force gene_id of SNP to be same as the gene_id of the splice site + vcf_splice_df = vcf_splice_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True) + vcf_splice_df = vcf_splice_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'feature', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']] + + #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait + vcf_splice_df['molecular_trait_gene_id'] = vcf_splice_df['molecular_trait_id'].apply(lambda x: x.split(".")[0]) + vcf_splice_df = vcf_splice_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True) + + #Splice site position + vcf_splice_df['Start_b'] += max_distance + vcf_splice_df = vcf_splice_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'splice_pos', 'Strand_b' : 'Strand'}) + + #Distance to splice site + vcf_splice_df['distance'] = np.abs(vcf_splice_df['Pos'] - vcf_splice_df['splice_pos']) + + #Choose unique SNPs by shortest distance to splice site (and inverse PIP for tie-breaking) + vcf_splice_df['pip_inv'] = 1. - vcf_splice_df['pip'] + + vcf_splice_df = vcf_splice_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True) + vcf_splice_df = vcf_splice_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True) + + vcf_df_filtered = vcf_splice_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'}) + vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'feature', 'strand', 'splice_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']] + + print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered))) + + #Store intermediate SNPs (filtered) + vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_splice_finemapped_filtered.bed.gz", sep='\t', index=False) + + #Reload filtered SNP file + vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_splice_finemapped_filtered.bed.gz", sep='\t', compression='gzip') + + #Only keep SNPs with PIP > cutoff + pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True) + + #Store final table of positive SNPs + pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_splice_positives.bed.gz", sep='\t', index=False) + + print("len(pos_df) = " + str(len(pos_df))) + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/qtl_data/sqtl_vcfs.py b/src/scripts/data/qtl_data/sqtl_vcfs.py new file mode 100644 index 0000000..d275a76 --- /dev/null +++ b/src/scripts/data/qtl_data/sqtl_vcfs.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +from optparse import OptionParser +import os +import pdb +import time + +import numpy as np +import pandas as pd +import pyranges as pr +from tqdm import tqdm + +''' +sqtl_vcfs.py + +Generate positive and negative sQTL sets from the QTL catalog txrevise. +''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options]' + parser = OptionParser(usage) + parser.add_option('--neg_pip', dest='neg_pip', + default=0.01, type='float', + help='PIP upper limit for negative examples. [Default: %default]') + parser.add_option('--pos_pip', dest='pos_pip', + default=0.9, type='float', + help='PIP lower limit for positive examples. [Default: %default]') + parser.add_option('--match_gene', dest='match_gene', + default=0, type='int', + help='Try finding negative in same gene as positive. [Default: %default]') + parser.add_option('--match_allele', dest='match_allele', + default=0, type='int', + help='Try finding negative with same ref and alt alleles. [Default: %default]') + parser.add_option('-o', dest='out_prefix', + default='qtlcat_sqtl') + (options,args) = parser.parse_args() + + tissue_name = options.out_prefix.split('txrev_')[1] + + gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf' + + # read variant table + qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_splice_negatives.bed.gz", sep='\t') + qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_splice_positives.bed.gz", sep='\t') + + # read TPM bin table and construct lookup dictionaries + tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']] + gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index') + + # filter on SNPs with genes in TPM bin dict + qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True) + qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True) + + #Load gene span annotation (protein-coding/categorized only) + gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str']) + gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist() + + # filter on SNPs with genes in GTF file + qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True) + qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True) + + bin_to_genes_dict = {} + for _, row in tpm_df.iterrows() : + + if row['bin_index'] not in bin_to_genes_dict : + bin_to_genes_dict[row['bin_index']] = [] + + bin_to_genes_dict[row['bin_index']].append(row['gene_id']) + + for sample_bin in bin_to_genes_dict : + bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin]) + + # split molecular trait id and filter for polyadenylation (for positives) + qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id] + qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id] + + qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'contained'] + qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'splice_dist'}) + + qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.contained.negative" + qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id'] + qtlcat_df_neg['event'] = 'contained' + qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'splice_dist'}) + + sqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True) + + # determine positive variants + sqtl_pos_df = sqtl_df[sqtl_df.pip >= options.pos_pip] + sqtl_neg_df = sqtl_df[sqtl_df.pip < options.neg_pip] + pos_variants = set(sqtl_pos_df.variant) + + neg_gene_and_allele_variants = 0 + neg_gene_variants = 0 + + neg_expr_and_allele_variants = 0 + neg_expr_variants = 0 + + unmatched_variants = 0 + + # choose negative variants + neg_variants = set() + neg_dict = {} + for pvariant in tqdm(pos_variants): + sqtl_this_df = sqtl_pos_df[sqtl_pos_df.variant == pvariant] + + neg_found = False + + # optionally prefer negative from positive's gene set + if options.match_gene == 1 and options.match_allele == 1 : + pgenes = set(sqtl_this_df.gene) + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True) + + if neg_found : + neg_gene_and_allele_variants += 1 + + if not neg_found and options.match_gene == 1 : + pgenes = set(sqtl_this_df.gene) + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False) + + if neg_found : + neg_gene_variants += 1 + + if not neg_found and options.match_allele == 1 : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True) + + if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True) + + if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True) + + if neg_found : + neg_expr_and_allele_variants += 1 + + if not neg_found : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False) + + if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False) + + if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r'] : + pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r']] + neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False) + + if neg_found : + neg_expr_variants += 1 + + if not neg_found : + print("[Warning] Could not find a matching negative for '" + pvariant + "'") + unmatched_variants += 1 + + print('%d positive variants' % len(pos_variants)) + print('%d negative variants' % len(neg_variants)) + print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants) + print(' - %d gene-matched negatives ' % neg_gene_variants) + print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants) + print(' - %d expr-matched negatives ' % neg_expr_variants) + print(' - %d unmatched negatives ' % unmatched_variants) + + pos_dict = {pv: pv for pv in pos_variants} + + # write VCFs + write_vcf('%s_pos.vcf' % options.out_prefix, sqtl_df, pos_variants, pos_dict) + write_vcf('%s_neg.vcf' % options.out_prefix, sqtl_df, neg_variants, neg_dict) + +def find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, match_allele) : + + gene_mask = np.array([gene in pgenes for gene in sqtl_neg_df.gene]) + sqtl_neg_gene_df = sqtl_neg_df[gene_mask] + + # match PAS distance + this_dist = sqtl_this_df.iloc[0].splice_dist + dist_cmp = np.abs(sqtl_neg_gene_df.splice_dist - this_dist) + dist_cmp_unique = np.sort(np.unique(dist_cmp.values)) + + this_ref = sqtl_this_df.iloc[0].ref + this_alt = sqtl_this_df.iloc[0].alt + + for ni_unique in dist_cmp_unique: + + sqtl_neg_gene_dist_df = sqtl_neg_gene_df.loc[dist_cmp == ni_unique] + + shuffle_index = np.arange(len(sqtl_neg_gene_dist_df), dtype='int32') + np.random.shuffle(shuffle_index) + + for nsqtl_i in range(len(sqtl_neg_gene_dist_df)) : + nsqtl = sqtl_neg_gene_dist_df.iloc[shuffle_index[nsqtl_i]] + + if not match_allele or (nsqtl.ref == this_ref and nsqtl.alt == this_alt): + if nsqtl.variant not in neg_variants and nsqtl.variant not in pos_variants: + + neg_variants.add(nsqtl.variant) + neg_dict[nsqtl.variant] = sqtl_this_df.iloc[0].variant + + return True + + return False + +def write_vcf(vcf_file, df, variants_write, variants_dict): + vcf_open = open(vcf_file, 'w') + print('##fileformat=VCFv4.2', file=vcf_open) + print('##INFO=', + file=vcf_open) + print('##INFO=', + file=vcf_open) + print('##INFO=', + file=vcf_open) + cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'] + print('\t'.join(cols), file=vcf_open) + + variants_written = set() + + for v in df.itertuples(): + if v.variant in variants_write and v.variant not in variants_written: + cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.'] + cols += ['MT=%s;SD=%d;PI=%s' % (v.molecular_trait_id, v.splice_dist, variants_dict[v.variant])] + print('\t'.join(cols), file=vcf_open) + variants_written.add(v.variant) + + vcf_open.close() + + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/src/scripts/data/training_data/Makefile b/src/scripts/data/training_data/Makefile new file mode 100644 index 0000000..170222b --- /dev/null +++ b/src/scripts/data/training_data/Makefile @@ -0,0 +1,47 @@ +FASTA_HUMAN=$$HG38/assembly/ucsc/hg38.ml.fa +GAPS_HUMAN=$$HG38/assembly/ucsc/hg38_gaps.bed +UMAP_HUMAN=$$HG38//mappability/umap_k36_t10_l32.bed +BLACK_HUMAN=$$HG38/blacklist/blacklist_hg38_all.bed + +FASTA_MOUSE=$$MM10/assembly/ucsc/mm10.ml.fa +GAPS_MOUSE=$$MM10/assembly/ucsc/mm10_gaps.bed +UMAP_MOUSE=$$MM10//mappability/umap_k36_t10_l32.bed +BLACK_MOUSE=$$MM10/blacklist/blacklist_mm10_all.bed + +ALIGN=$$HG38/align/hg38.mm10.syn.net.gz + +OUT=/scratch3/drk/seqnn/data/v9 + +# LENGTH=393216 +# TSTRIDE=43691 # (393216-2*131072)/3 +# CROP=131072 + +LENGTH=524288 +TSTRIDE=49173 # (524288-2*163840)/4 + 21 +CROP=163840 +WIDTH=32 +FOLDS=8 + +AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH) +DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH) + + +all: $(OUT)/hg38/tfrecords/train-0.tfr $(OUT)/mm10/tfrecords/train-0.tfr + +umap_human.bed: + cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed + +umap_mouse.bed: + cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed + +targets_human.txt targets_mouse.txt: + ./make_targets.py + +$(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed + basenji_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE) + +$(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt + basenji_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt + +$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt + basenji_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt diff --git a/src/scripts/data/training_data/README.md b/src/scripts/data/training_data/README.md new file mode 100644 index 0000000..de8af67 --- /dev/null +++ b/src/scripts/data/training_data/README.md @@ -0,0 +1,11 @@ +## Data processing & Training + +Processing of the ENCODE, GTEx, FANTOM5, and CATlas training data is done through the Makefile in this folder. It requires a number of auxiliary files (e.g. genome alignments and lists of unmappable regions), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).
+
+The Makefile relies on the script 'basenji_data.py' from the [basenji repository](https://github.com/calico/basenji/blob/master/bin/basenji_data.py), which in turn calls the scripts 'basenji_data_read.py' and 'basenji_data_write.py' from the same repo, in order to (1) read coverage data (from bigwig-like files) along with a matched segment from a fasta genome file, and (2) write the (one-hot coded) sequence along with coverage values into compressed TF records.
+
+*Notes*: +- The attached Makefile shows the exact commands used to call basenji_data.py and other related scripts to create the specific training data for the published model. +- The script(s) take as input a fasta genome file, optional blacklist+unmappable region files, as well as a .txt file where each row points to a bigwig coverage file location (see for [this file](https://raw.githubusercontent.com/calico/borzoi/main/examples/targets_human.txt)).
+
+The model training is done through the script 'hound_train.py' from the [baskerville repository](https://github.com/calico/baskerville/blob/main/src/baskerville/scripts/hound_train.py). Most of the training parameters are set through a .json file that is supplied to the script. The published model's parameter file can be found [here](https://storage.googleapis.com/seqnn-share/borzoi/params.json).
From 6882605d6f83bc19bf9c7d52852808354308184b Mon Sep 17 00:00:00 2001 From: johli Date: Fri, 19 Apr 2024 11:04:24 -0700 Subject: [PATCH 2/3] Update README.md --- src/scripts/data/training_data/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scripts/data/training_data/README.md b/src/scripts/data/training_data/README.md index de8af67..7c2751e 100644 --- a/src/scripts/data/training_data/README.md +++ b/src/scripts/data/training_data/README.md @@ -1,11 +1,11 @@ ## Data processing & Training -Processing of the ENCODE, GTEx, FANTOM5, and CATlas training data is done through the Makefile in this folder. It requires a number of auxiliary files (e.g. genome alignments and lists of unmappable regions), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).
-
+Processing of ENCODE, GTEx, FANTOM5, and CATlas training data is done through a Makefile. It requires a number of auxiliary files (e.g. genome alignments), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).
+ The Makefile relies on the script 'basenji_data.py' from the [basenji repository](https://github.com/calico/basenji/blob/master/bin/basenji_data.py), which in turn calls the scripts 'basenji_data_read.py' and 'basenji_data_write.py' from the same repo, in order to (1) read coverage data (from bigwig-like files) along with a matched segment from a fasta genome file, and (2) write the (one-hot coded) sequence along with coverage values into compressed TF records.
-
+ *Notes*: - The attached Makefile shows the exact commands used to call basenji_data.py and other related scripts to create the specific training data for the published model. - The script(s) take as input a fasta genome file, optional blacklist+unmappable region files, as well as a .txt file where each row points to a bigwig coverage file location (see for [this file](https://raw.githubusercontent.com/calico/borzoi/main/examples/targets_human.txt)).
-
+ The model training is done through the script 'hound_train.py' from the [baskerville repository](https://github.com/calico/baskerville/blob/main/src/baskerville/scripts/hound_train.py). Most of the training parameters are set through a .json file that is supplied to the script. The published model's parameter file can be found [here](https://storage.googleapis.com/seqnn-share/borzoi/params.json).
From 2eebb71bfac223768297dbf0754e7516203194cd Mon Sep 17 00:00:00 2001 From: johli Date: Fri, 19 Apr 2024 11:06:22 -0700 Subject: [PATCH 3/3] Update README.md --- src/scripts/data/qtl_data/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/data/qtl_data/README.md b/src/scripts/data/qtl_data/README.md index 3a3589e..8831abf 100644 --- a/src/scripts/data/qtl_data/README.md +++ b/src/scripts/data/qtl_data/README.md @@ -1,9 +1,9 @@ ## QTL data processing -The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.
-
+The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.
+ *Notes*: -- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
+- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
As a prerequisite to generating any of the QTL datasets, run the following scripts (in order):