From 35a501ba90c97a82f5d8ddeaa3fc90be5bc815ce Mon Sep 17 00:00:00 2001
From: Johannes Linder <jlinder@jlinder-mbp-1802.calicolabs.local>
Date: Fri, 19 Apr 2024 11:00:29 -0700
Subject: [PATCH 1/3] Added data processing scripts.

---
 src/scripts/data/qtl_data/README.md           |  33 +++
 src/scripts/data/qtl_data/download_finemap.py |  62 +++++
 src/scripts/data/qtl_data/download_sumstat.py |  56 +++++
 .../qtl_data/ipaqtl_make_negative_sets.py     | 196 +++++++++++++++
 .../qtl_data/ipaqtl_make_positive_sets.py     | 191 ++++++++++++++
 src/scripts/data/qtl_data/ipaqtl_vcfs.py      | 234 ++++++++++++++++++
 .../data/qtl_data/make_expression_tables.py   | 181 ++++++++++++++
 src/scripts/data/qtl_data/make_vcfs.py        | 112 +++++++++
 .../data/qtl_data/merge_finemapping_tables.py | 102 ++++++++
 .../data/qtl_data/paqtl_make_negative_sets.py | 196 +++++++++++++++
 .../data/qtl_data/paqtl_make_positive_sets.py | 191 ++++++++++++++
 src/scripts/data/qtl_data/paqtl_vcfs.py       | 234 ++++++++++++++++++
 .../data/qtl_data/sqtl_make_negative_sets.py  | 195 +++++++++++++++
 .../data/qtl_data/sqtl_make_positive_sets.py  | 190 ++++++++++++++
 src/scripts/data/qtl_data/sqtl_vcfs.py        | 234 ++++++++++++++++++
 src/scripts/data/training_data/Makefile       |  47 ++++
 src/scripts/data/training_data/README.md      |  11 +
 17 files changed, 2465 insertions(+)
 create mode 100644 src/scripts/data/qtl_data/README.md
 create mode 100644 src/scripts/data/qtl_data/download_finemap.py
 create mode 100644 src/scripts/data/qtl_data/download_sumstat.py
 create mode 100644 src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py
 create mode 100644 src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py
 create mode 100644 src/scripts/data/qtl_data/ipaqtl_vcfs.py
 create mode 100644 src/scripts/data/qtl_data/make_expression_tables.py
 create mode 100644 src/scripts/data/qtl_data/make_vcfs.py
 create mode 100644 src/scripts/data/qtl_data/merge_finemapping_tables.py
 create mode 100644 src/scripts/data/qtl_data/paqtl_make_negative_sets.py
 create mode 100644 src/scripts/data/qtl_data/paqtl_make_positive_sets.py
 create mode 100644 src/scripts/data/qtl_data/paqtl_vcfs.py
 create mode 100644 src/scripts/data/qtl_data/sqtl_make_negative_sets.py
 create mode 100644 src/scripts/data/qtl_data/sqtl_make_positive_sets.py
 create mode 100644 src/scripts/data/qtl_data/sqtl_vcfs.py
 create mode 100644 src/scripts/data/training_data/Makefile
 create mode 100644 src/scripts/data/training_data/README.md
diff --git a/src/scripts/data/qtl_data/README.md b/src/scripts/data/qtl_data/README.md
new file mode 100644
index 0000000..3a3589e
--- /dev/null
+++ b/src/scripts/data/qtl_data/README.md
@@ -0,0 +1,33 @@
+## QTL data processing
+
+The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.<br/>
+<br/>
+*Notes*: 
+- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).<br/>
+<br/>
+
+As a prerequisite to generating any of the QTL datasets, run the following scripts (in order):
+1. download_finemap.py
+2. download_sumstat.py
+3. merge_finemapping_tables.py
+4. make_expression_tables.py
+<br/>
+
+To prepare the sQTL dataset, run these scripts:
+1. sqtl_make_positive_sets.py
+2. sqtl_make_negative_sets.py
+<br/>
+
+To prepare the paQTL dataset, run these scripts:
+1. paqtl_make_positive_sets.py
+2. paqtl_make_negative_sets.py
+<br/>
+
+To prepare the ipaQTL dataset, run these scripts:
+1. ipaqtl_make_positive_sets.py
+2. ipaqtl_make_negative_sets.py
+<br/>
+
+Finally, to generate the QTL VCF files, run this script:
+1. make_vcfs.py
+<br/>
diff --git a/src/scripts/data/qtl_data/download_finemap.py b/src/scripts/data/qtl_data/download_finemap.py
new file mode 100644
index 0000000..558e3ef
--- /dev/null
+++ b/src/scripts/data/qtl_data/download_finemap.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import pandas as pd
+
+import util
+
+'''
+download_finemap.py
+
+Download QTL Catalogue fine-mapping results.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+
+    # read remote table
+    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')
+
+    # filter GTEx (for now)
+    samples_df = samples_df[samples_df.study == 'GTEx']
+
+
+    ################################################
+    # txrevise for splicing / polyA / TSS QTLs
+
+    os.makedirs('txrev', exist_ok=True)
+    txrev_df = samples_df[samples_df.quant_method == 'txrev']
+
+    jobs = []
+    for all_ftp_path in txrev_df.ftp_path:
+        # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
+        # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets//Alasoo_2018_txrev_macrophage_IFNg+Salmonella.purity_filtered.txt.gz
+
+        all_ftp_file = all_ftp_path.split('/')[-1]
+        fine_ftp_file = all_ftp_file.replace('all.tsv', 'purity_filtered.txt')
+
+        fine_ftp_path = 'ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets/'
+        fine_ftp_path += fine_ftp_file
+
+        local_path = 'txrev/%s' % fine_ftp_file
+        if not os.path.isfile(local_path):
+            cmd = 'curl -o %s %s' % (local_path, fine_ftp_path)
+            jobs.append(cmd)
+
+    util.exec_par(jobs, 4, verbose=True)
+    # print('\n'.join(jobs))
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/download_sumstat.py b/src/scripts/data/qtl_data/download_sumstat.py
new file mode 100644
index 0000000..ca402df
--- /dev/null
+++ b/src/scripts/data/qtl_data/download_sumstat.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import pandas as pd
+
+import util
+
+'''
+download_sumstat.py
+
+Download QTL Catalogue sumstats.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+
+    # read remote table
+    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')
+
+    # filter GTEx (for now)
+    samples_df = samples_df[samples_df.study == 'GTEx']
+
+
+    ################################################
+    # ge for sumstat (we want SNPs and possibly also base expression)
+
+    os.makedirs('ge', exist_ok=True)
+    txrev_df = samples_df[samples_df.quant_method == 'ge']
+
+    jobs = []
+    for all_ftp_path in txrev_df.ftp_path:
+        # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
+        
+        local_path = 'ge/%s' % all_ftp_path.split("/")[-1]
+        
+        if not os.path.isfile(local_path):
+            cmd = 'curl -o %s %s' % (local_path, all_ftp_path)
+            jobs.append(cmd)
+
+    util.exec_par(jobs, 4, verbose=True)
+    # print('\n'.join(jobs))
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py b/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py
new file mode 100644
index 0000000..3f4d49d
--- /dev/null
+++ b/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_negative_sets.py
+
+Build tables with negative (non-causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+    
+    #Parameters
+    pip_cutoff = 0.01
+    max_distance = 10000
+    gene_pad = 50
+    apa_file = 'polyadb_intron.bed'
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+    finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Compile negative SNP set for each tissue
+    for tissue_name in tissue_names :
+        
+        print("-- " + str(tissue_name) + " --")
+
+        #Load summary stats and extract unique set of SNPs
+        vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+
+        #Only keep SNPs (no indels)
+        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+        vcf_df['start'] = vcf_df['position'].astype(int)
+        vcf_df['end'] = vcf_df['start'] + 1
+        vcf_df['strand'] = "."
+
+        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
+        vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+        print("len(vcf_df) = " + str(len(vcf_df)))
+
+        #Store intermediate SNPs
+        #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)
+        
+        #Load polyadenylation site annotation
+        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+        apa_df['Start'] += 1
+        
+        #Load gene span annotation
+        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+        
+        #Join dataframes against gtf annotation
+        apa_pr = pr.PyRanges(apa_df)
+        gtf_pr = pr.PyRanges(gtf_df)
+        vcf_pr = pr.PyRanges(vcf_df)
+
+        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+        
+        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)
+
+        apa_gtf_df['Start'] -= max_distance
+        apa_gtf_df['End'] += max_distance
+        
+        #Join vcf against polyadenylation annotation
+        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+        
+        #Force gene_id of SNP to be same as the gene_id of the polyA site        
+        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]
+
+        #PolyA site position
+        vcf_apa_df['Start_b'] += max_distance
+        vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+        #Distance to polyA site
+        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+        
+        #Choose unique SNPs by shortest distance to polyA site
+        vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]
+
+        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+        
+        #Store intermediate SNPs (filtered)
+        vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', index=False)
+        
+        #Reload filtered SNP file
+        vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', compression='gzip')
+
+        #Create variant identifier
+        vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']
+        
+        #Load merged fine-mapping dataframe
+        finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]
+        
+        #Join against fine-mapping dataframe
+        neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
+        neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.
+        
+        #Only keep SNPs with PIP < cutoff
+        neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+        #Store final table of negative SNPs
+        neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t', index=False)
+        
+        print("len(neg_df) = " + str(len(neg_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py b/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py
new file mode 100644
index 0000000..f1afb7b
--- /dev/null
+++ b/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_positive_sets.py
+
+Build tables with positive (causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+    
+    #Parameters
+    pip_cutoff = 0.01
+    max_distance = 10000
+    gene_pad = 50
+    apa_file = 'polyadb_intron.bed'
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Compile positive SNP set for each tissue
+    for tissue_name in tissue_names :
+        
+        print("-- " + str(tissue_name) + " --")
+
+        #Load fine-mapping table
+        vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False)
+
+        #Only keep SNPs (no indels)
+        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+        
+        #Only keep SNPs associated with polyadenylation events
+        vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".downstream.")].copy().reset_index(drop=True)
+
+        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+        vcf_df['start'] = vcf_df['position'].astype(int)
+        vcf_df['end'] = vcf_df['start'] + 1
+        vcf_df['strand'] = "."
+
+        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']]
+        vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+        print("len(vcf_df) = " + str(len(vcf_df)))
+
+        #Load polyadenylation site annotation
+        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+        apa_df['Start'] += 1
+        
+        #Load gene span annotation
+        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+        
+        #Join dataframes against gtf annotation
+        apa_pr = pr.PyRanges(apa_df)
+        gtf_pr = pr.PyRanges(gtf_df)
+        vcf_pr = pr.PyRanges(vcf_df)
+
+        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True)
+
+        apa_gtf_df['Start'] -= max_distance
+        apa_gtf_df['End'] += max_distance
+        
+        #Join vcf against polyadenylation annotation
+        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+        
+        #Force gene_id of SNP to be same as the gene_id of the polyA site        
+        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']]
+        
+        #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait
+        vcf_apa_df['molecular_trait_gene_id'] = vcf_apa_df['molecular_trait_id'].apply(lambda x: x.split(".")[0])
+        vcf_apa_df = vcf_apa_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True)
+
+        #PolyA site position
+        vcf_apa_df['Start_b'] += max_distance
+        vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+        #Distance to polyA site
+        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+        
+        #Choose unique SNPs by shortest distance to polyA site (and inverse PIP for tie-breaking)
+        vcf_apa_df['pip_inv'] = 1. - vcf_apa_df['pip']
+        
+        vcf_apa_df = vcf_apa_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']]
+
+        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+        
+        #Store intermediate SNPs (filtered)
+        vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_finemapped_filtered.bed.gz", sep='\t', index=False)
+        
+        #Reload filtered SNP file
+        vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_finemapped_filtered.bed.gz", sep='\t', compression='gzip')
+
+        #Only keep SNPs with PIP > cutoff
+        pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+        #Store final table of positive SNPs
+        pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_positives.bed.gz", sep='\t', index=False)
+        
+        print("len(pos_df) = " + str(len(pos_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/ipaqtl_vcfs.py b/src/scripts/data/qtl_data/ipaqtl_vcfs.py
new file mode 100644
index 0000000..773c45e
--- /dev/null
+++ b/src/scripts/data/qtl_data/ipaqtl_vcfs.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+import os
+import pdb
+import time
+
+import numpy as np
+import pandas as pd
+import pyranges as pr
+from tqdm import tqdm
+
+'''
+ipaqtl_vcfs.py
+
+Generate positive and negative intronic paQTL sets from the QTL catalog txrevise.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options]'
+    parser = OptionParser(usage)
+    parser.add_option('--neg_pip', dest='neg_pip',
+            default=0.01, type='float',
+            help='PIP upper limit for negative examples. [Default: %default]')
+    parser.add_option('--pos_pip', dest='pos_pip',
+            default=0.9, type='float',
+            help='PIP lower limit for positive examples. [Default: %default]')
+    parser.add_option('--match_gene', dest='match_gene',
+            default=0, type='int',
+            help='Try finding negative in same gene as positive. [Default: %default]')
+    parser.add_option('--match_allele', dest='match_allele',
+            default=0, type='int',
+            help='Try finding negative with same ref and alt alleles. [Default: %default]')
+    parser.add_option('-o', dest='out_prefix',
+            default='qtlcat_ipaqtl')
+    (options,args) = parser.parse_args()
+
+    tissue_name = options.out_prefix.split('txrev_')[1]
+    
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf'
+    
+    # read variant table
+    qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t')
+    qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_positives.bed.gz", sep='\t')
+    
+    # read TPM bin table and construct lookup dictionaries
+    tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']]
+    gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index')
+    
+    # filter on SNPs with genes in TPM bin dict
+    qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+    qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+    
+    #Load gene span annotation (protein-coding/categorized only)
+    gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str'])
+    gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist()
+
+    # filter on SNPs with genes in GTF file
+    qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+    qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+    
+    bin_to_genes_dict = {}
+    for _, row in tpm_df.iterrows() :
+        
+        if row['bin_index'] not in bin_to_genes_dict :
+            bin_to_genes_dict[row['bin_index']] = []
+        
+        bin_to_genes_dict[row['bin_index']].append(row['gene_id'])
+    
+    for sample_bin in bin_to_genes_dict :
+        bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin])
+
+    # split molecular trait id and filter for polyadenylation (for positives)
+    qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id]
+    qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id]
+
+    qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'downstream']
+    qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'pas_dist'})
+    
+    qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.downstream.negative"
+    qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id']
+    qtlcat_df_neg['event'] = 'downstream'
+    qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'pas_dist'})
+
+    paqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True)
+
+    # determine positive variants
+    paqtl_pos_df = paqtl_df[paqtl_df.pip >= options.pos_pip]
+    paqtl_neg_df = paqtl_df[paqtl_df.pip < options.neg_pip]
+    pos_variants = set(paqtl_pos_df.variant)
+    
+    neg_gene_and_allele_variants = 0
+    neg_gene_variants = 0
+    
+    neg_expr_and_allele_variants = 0
+    neg_expr_variants = 0
+    
+    unmatched_variants = 0
+
+    # choose negative variants
+    neg_variants = set()
+    neg_dict = {}
+    for pvariant in tqdm(pos_variants):
+        paqtl_this_df = paqtl_pos_df[paqtl_pos_df.variant == pvariant]
+
+        neg_found = False
+        
+        # optionally prefer negative from positive's gene set
+        if options.match_gene == 1 and options.match_allele == 1 :
+            pgenes = set(paqtl_this_df.gene)
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if neg_found :
+                neg_gene_and_allele_variants += 1
+        
+        if not neg_found and options.match_gene == 1 :
+            pgenes = set(paqtl_this_df.gene)
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if neg_found :
+                neg_gene_variants += 1
+        
+        if not neg_found and options.match_allele == 1 :
+            pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if neg_found :
+                neg_expr_and_allele_variants += 1
+        
+        if not neg_found :
+            pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if neg_found :
+                neg_expr_variants += 1
+        
+        if not neg_found :
+            print("[Warning] Could not find a matching negative for '" + pvariant + "'")
+            unmatched_variants += 1
+
+    print('%d positive variants' % len(pos_variants))
+    print('%d negative variants' % len(neg_variants))
+    print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants)
+    print(' - %d gene-matched negatives ' % neg_gene_variants)
+    print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants)
+    print(' - %d expr-matched negatives ' % neg_expr_variants)
+    print(' - %d unmatched negatives ' % unmatched_variants)
+
+    pos_dict = {pv: pv for pv in pos_variants}
+    
+    # write VCFs
+    write_vcf('%s_pos.vcf' % options.out_prefix, paqtl_df, pos_variants, pos_dict)
+    write_vcf('%s_neg.vcf' % options.out_prefix, paqtl_df, neg_variants, neg_dict)
+
+def find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, match_allele) :
+    
+    gene_mask = np.array([gene in pgenes for gene in paqtl_neg_df.gene])
+    paqtl_neg_gene_df = paqtl_neg_df[gene_mask]
+
+    # match PAS distance
+    this_dist = paqtl_this_df.iloc[0].pas_dist
+    dist_cmp = np.abs(paqtl_neg_gene_df.pas_dist - this_dist)
+    dist_cmp_unique = np.sort(np.unique(dist_cmp.values))
+    
+    this_ref = paqtl_this_df.iloc[0].ref
+    this_alt = paqtl_this_df.iloc[0].alt
+
+    for ni_unique in dist_cmp_unique:
+        
+        paqtl_neg_gene_dist_df = paqtl_neg_gene_df.loc[dist_cmp == ni_unique]
+        
+        shuffle_index = np.arange(len(paqtl_neg_gene_dist_df), dtype='int32')
+        np.random.shuffle(shuffle_index)
+        
+        for npaqtl_i in range(len(paqtl_neg_gene_dist_df)) :
+            npaqtl = paqtl_neg_gene_dist_df.iloc[shuffle_index[npaqtl_i]]
+        
+            if not match_allele or (npaqtl.ref == this_ref and npaqtl.alt == this_alt):
+                if npaqtl.variant not in neg_variants and npaqtl.variant not in pos_variants:
+
+                    neg_variants.add(npaqtl.variant)
+                    neg_dict[npaqtl.variant] = paqtl_this_df.iloc[0].variant
+
+                    return True
+    
+    return False
+
+def write_vcf(vcf_file, df, variants_write, variants_dict):
+    vcf_open = open(vcf_file, 'w')
+    print('##fileformat=VCFv4.2', file=vcf_open)
+    print('##INFO=<ID=MT,Number=1,Type=String,Description="Molecular trait id">',
+        file=vcf_open)
+    print('##INFO=<ID=PD,Number=1,Type=Integer,Description="PAS distance">',
+        file=vcf_open)
+    print('##INFO=<ID=PI,Number=1,Type=String,Description="Positive SNP id">',
+        file=vcf_open)
+    cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+    print('\t'.join(cols), file=vcf_open)
+
+    variants_written = set()
+
+    for v in df.itertuples():
+        if v.variant in variants_write and v.variant not in variants_written:
+            cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.']
+            cols += ['MT=%s;PD=%d;PI=%s' % (v.molecular_trait_id, v.pas_dist, variants_dict[v.variant])]
+            print('\t'.join(cols), file=vcf_open)
+            variants_written.add(v.variant)
+
+    vcf_open.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/make_expression_tables.py b/src/scripts/data/qtl_data/make_expression_tables.py
new file mode 100644
index 0000000..ddc2a63
--- /dev/null
+++ b/src/scripts/data/qtl_data/make_expression_tables.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+import matplotlib.pyplot as plt
+
+'''
+make_expression_tables.py
+
+Contruct TPM bucket to sample genes from.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+
+    #Define tissue column-to-file mapping
+    tissue_dict = {
+        'Adipose - Subcutaneous' : 'adipose_subcutaneous',
+        'Adipose - Visceral (Omentum)' : 'adipose_visceral',
+        'Adrenal Gland' : 'adrenal_gland',
+        'Artery - Aorta' : 'artery_aorta',
+        'Artery - Coronary' : 'artery_coronary',
+        'Artery - Tibial' : 'artery_tibial',
+        'Whole Blood' : 'blood',
+        'Brain - Amygdala' : 'brain_amygdala',
+        'Brain - Anterior cingulate cortex (BA24)' : 'brain_anterior_cingulate_cortex',
+        'Brain - Caudate (basal ganglia)' : 'brain_caudate',
+        'Brain - Cerebellar Hemisphere' : 'brain_cerebellar_hemisphere',
+        'Brain - Cerebellum' : 'brain_cerebellum',
+        'Brain - Cortex' : 'brain_cortex',
+        'Brain - Frontal Cortex (BA9)' : 'brain_frontal_cortex',
+        'Brain - Hippocampus' : 'brain_hippocampus',
+        'Brain - Hypothalamus' : 'brain_hypothalamus',
+        'Brain - Nucleus accumbens (basal ganglia)' : 'brain_nucleus_accumbens',
+        'Brain - Putamen (basal ganglia)' : 'brain_putamen',
+        'Brain - Spinal cord (cervical c-1)' : 'brain_spinal_cord',
+        'Brain - Substantia nigra' : 'brain_substantia_nigra',
+        'Breast - Mammary Tissue' : 'breast',
+        'Colon - Sigmoid' : 'colon_sigmoid',
+        'Colon - Transverse' : 'colon_transverse',
+        'Esophagus - Gastroesophageal Junction' : 'esophagus_gej',
+        'Esophagus - Mucosa' : 'esophagus_mucosa',
+        'Esophagus - Muscularis' : 'esophagus_muscularis',
+        'Cells - Cultured fibroblasts' : 'fibroblast',
+        'Heart - Atrial Appendage' : 'heart_atrial_appendage',
+        'Heart - Left Ventricle' : 'heart_left_ventricle',
+        'Kidney - Cortex' : 'kidney_cortex',
+        'Cells - EBV-transformed lymphocytes' : 'LCL',
+        'Liver' : 'liver',
+        'Lung' : 'lung',
+        'Minor Salivary Gland' : 'minor_salivary_gland',
+        'Muscle - Skeletal' : 'muscle',
+        'Nerve - Tibial' : 'nerve_tibial',
+        'Ovary' : 'ovary',
+        'Pancreas' : 'pancreas',
+        'Pituitary' : 'pituitary',
+        'Prostate' : 'prostate',
+        'Skin - Not Sun Exposed (Suprapubic)' : 'skin_not_sun_exposed',
+        'Skin - Sun Exposed (Lower leg)' : 'skin_sun_exposed',
+        'Small Intestine - Terminal Ileum' : 'small_intestine',
+        'Spleen' : 'spleen',
+        'Stomach' : 'stomach',
+        'Testis' : 'testis',
+        'Thyroid' : 'thyroid',
+        'Uterus' : 'uterus',
+        'Vagina' : 'vagina',
+    }
+    
+    for tissue_name in tissue_dict :
+        
+        #Load TPM matrix
+        tpm_df = pd.read_csv("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", sep='\t', compression='gzip', skiprows=2)
+
+        save_name = tissue_dict[tissue_name]
+
+        print("-- " + save_name + " --")
+
+        #Clean dataframe
+        tpm_df['gene_id'] = tpm_df['Name'].apply(lambda x: x.split(".")[0])
+
+        tpm_df = tpm_df.drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        tpm_df['tpm'] = tpm_df[tissue_name]
+        tpm_df = tpm_df[['gene_id', 'tpm']]
+
+        #Get non-zero TPM entries
+        tpm_df_zero = tpm_df.loc[tpm_df['tpm'] == 0].copy().reset_index(drop=True)
+        tpm_df_nonzero = tpm_df.loc[tpm_df['tpm'] > 0].copy().reset_index(drop=True)
+
+        tpm_df_zero['tpm_log2'] = 0.
+        tpm_df_nonzero['tpm_log2'] = np.log2(tpm_df_nonzero['tpm'])
+
+        #Clip at extremes
+        min_q = 0.0075
+        max_q = 0.9925
+
+        #Log2 fold change bin sizes
+        bin_size = 0.4
+        bin_offset = 0.15
+
+        min_tpm_log2 = np.quantile(tpm_df_nonzero['tpm_log2'], q=min_q)
+        max_tpm_log2 = np.quantile(tpm_df_nonzero['tpm_log2'], q=max_q)
+
+        tpm_df_nonzero.loc[tpm_df_nonzero['tpm_log2'] < min_tpm_log2, 'tpm_log2'] = min_tpm_log2
+        tpm_df_nonzero.loc[tpm_df_nonzero['tpm_log2'] > max_tpm_log2, 'tpm_log2'] = max_tpm_log2
+
+        tpm_log2 = tpm_df_nonzero['tpm_log2'].values
+
+        n_bins = int((max_tpm_log2 - min_tpm_log2) / bin_size)
+
+        #Get sample bins
+        sample_bins = np.linspace(min_tpm_log2, max_tpm_log2, n_bins+1)
+
+        #Map values to bins
+        bin_index = np.digitize(tpm_log2, sample_bins[1:], right=True)
+        bin_index_l = np.digitize(tpm_log2 - bin_offset, sample_bins[1:], right=True)
+        bin_index_r = np.digitize(tpm_log2 + bin_offset, sample_bins[1:], right=True)
+
+        tpm_df_zero['bin_index_l'] = -1 * np.ones(len(tpm_df_zero), dtype='int32')
+        tpm_df_zero['bin_index'] = -1 * np.ones(len(tpm_df_zero), dtype='int32')
+        tpm_df_zero['bin_index_r'] = -1 * np.ones(len(tpm_df_zero), dtype='int32')
+
+        tpm_df_nonzero['bin_index_l'] = bin_index_l
+        tpm_df_nonzero['bin_index'] = bin_index
+        tpm_df_nonzero['bin_index_r'] = bin_index_r
+
+        tpm_df = pd.concat([tpm_df_zero, tpm_df_nonzero]).copy().reset_index(drop=True)
+
+        tpm_df = tpm_df.sort_values(by='gene_id', ascending=True).copy().reset_index(drop=True)
+
+        #Save dataframe
+        tpm_df.to_csv('ge/GTEx_ge_' + save_name + "_tpms.csv", sep='\t', index=False)
+
+        #Visualize TPM sample bins
+        tpm_df_filtered = tpm_df.loc[tpm_df['tpm'] > 0.]
+
+        f = plt.figure(figsize=(4, 3))
+
+        plt.hist(tpm_df_filtered['bin_index'].values, bins=np.unique(tpm_df_filtered['bin_index'].values))
+
+        plt.xlim(0, np.max(tpm_df_filtered['bin_index'].values))
+
+        plt.xticks(fontsize=8)
+        plt.yticks(fontsize=8)
+
+        plt.xlabel("Sample bin (FC < " + str(round(2**(bin_size+2*bin_offset), 2)) + ")", fontsize=8)
+        plt.ylabel("# of genes", fontsize=8)
+
+        plt.title("TPM sample bins (" + save_name + ")", fontsize=8)
+
+        plt.tight_layout()
+
+        plt.savefig('ge/GTEx_ge_' + save_name + "_tpms.png", transparent=False, dpi=300)
+
+        plt.close()
+
+        #Check and warn in case of low-support bins
+        _, bin_support = np.unique(tpm_df_filtered['bin_index'].values, return_counts=True)
+
+        if np.any(bin_support < 100) :
+            print("[Warning] Less than 100 genes in some of the TPM sample bins (min = " + str(int(np.min(bin_support))) + ").")
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/make_vcfs.py b/src/scripts/data/qtl_data/make_vcfs.py
new file mode 100644
index 0000000..aa251d0
--- /dev/null
+++ b/src/scripts/data/qtl_data/make_vcfs.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import glob
+import os
+
+import pandas as pd
+
+import util
+
+'''
+make_vcfs.py
+
+Download QTL Catalogue fine-mapping results.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+
+    pip = 0.2
+    match_gene = 0
+    match_allele = 1
+    
+    ################################################
+    # intronic polyA QTLs
+    
+    out_dir = 'ipaqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '')
+    os.makedirs(out_dir, exist_ok=True)
+
+    jobs = []
+    for table_file in glob.glob('txrev/*.txt.gz'):
+        out_prefix = table_file.replace('txrev/', '%s/' % out_dir)
+        out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '')
+        cmd = './ipaqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix)
+        jobs.append(cmd)
+    util.exec_par(jobs, 6, verbose=True)
+
+    # merge study/tissue variants
+    mpos_vcf_file = '%s/pos_merge.vcf' % out_dir
+    mneg_vcf_file = '%s/neg_merge.vcf' % out_dir
+    merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir)
+    merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir)
+    
+    
+    ################################################
+    # polyA QTLs
+
+    out_dir = 'paqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '')
+    os.makedirs(out_dir, exist_ok=True)
+
+    jobs = []
+    for table_file in glob.glob('txrev/*.txt.gz'):
+        out_prefix = table_file.replace('txrev/', '%s/' % out_dir)
+        out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '')
+        cmd = './paqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix)
+        jobs.append(cmd)
+    util.exec_par(jobs, 6, verbose=True)
+
+    # merge study/tissue variants
+    mpos_vcf_file = '%s/pos_merge.vcf' % out_dir
+    mneg_vcf_file = '%s/neg_merge.vcf' % out_dir
+    merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir)
+    merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir)
+    
+    ################################################
+    # splicing QTLs
+    
+    out_dir = 'sqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '')
+    os.makedirs(out_dir, exist_ok=True)
+
+    jobs = []
+    for table_file in glob.glob('txrev/*.txt.gz'):
+        out_prefix = table_file.replace('txrev/', '%s/' % out_dir)
+        out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '')
+        cmd = './sqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix)
+        jobs.append(cmd)
+    util.exec_par(jobs, 6, verbose=True)
+
+    # merge study/tissue variants
+    mpos_vcf_file = '%s/pos_merge.vcf' % out_dir
+    mneg_vcf_file = '%s/neg_merge.vcf' % out_dir
+    merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir)
+    merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir)
+    
+
+def merge_variants(merge_vcf_file, vcf_glob):
+    with open(merge_vcf_file, 'w') as merge_vcf_open:
+        vcf0_file = list(glob.glob(vcf_glob))[0]
+        for line in open(vcf0_file):
+            if line[0] == '#':
+                print(line, end='', file=merge_vcf_open)
+
+        merged_variants = set()
+        for vcf_file in glob.glob(vcf_glob):
+            for line in open(vcf_file):
+                if not line.startswith('#'):
+                    variant = line.split()[2]
+                    if variant not in merged_variants:
+                        print(line, file=merge_vcf_open, end='')
+                        merged_variants.add(variant)
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/merge_finemapping_tables.py b/src/scripts/data/qtl_data/merge_finemapping_tables.py
new file mode 100644
index 0000000..ac4fa7d
--- /dev/null
+++ b/src/scripts/data/qtl_data/merge_finemapping_tables.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+'''
+merge_finemapping_tables.py
+
+Merge fine-mapping tables of QTL credible sets.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Load and merge fine-mapping results
+    dfs = []
+    for tissue_name in tissue_names :
+
+        print("-- " + tissue_name + " --")
+
+        df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip'], low_memory=False)
+        dfs.append(df.sort_values(by='pip', ascending=False).drop_duplicates(subset=['variant'], keep='first').copy().reset_index(drop=True))
+
+    df = pd.concat(dfs).sort_values(by='pip', ascending=False).drop_duplicates(subset=['variant'], keep='first').copy().reset_index(drop=True)
+
+    df['chromosome'] = "chr" + df['chromosome'].astype(str)
+    df = df.rename(columns={'chromosome' : 'chrom', 'position' : 'pos'})
+    
+    print("len(df) = " + str(len(df)))
+
+    #Save union of dataframes
+    df.to_csv("txrev/GTEx_txrev_finemapped_merged.csv.gz", sep='\t', index=False)
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/paqtl_make_negative_sets.py b/src/scripts/data/qtl_data/paqtl_make_negative_sets.py
new file mode 100644
index 0000000..a5da60d
--- /dev/null
+++ b/src/scripts/data/qtl_data/paqtl_make_negative_sets.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_negative_sets.py
+
+Build tables with negative (non-causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+    
+    #Parameters
+    pip_cutoff = 0.01
+    max_distance = 10000
+    gene_pad = 50
+    apa_file = '/home/drk/common/data/genomes/hg38/genes/polyadb/polyadb_exon3.bed'
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+    finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Compile negative SNP set for each tissue
+    for tissue_name in tissue_names :
+        
+        print("-- " + str(tissue_name) + " --")
+
+        #Load summary stats and extract unique set of SNPs
+        vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+
+        #Only keep SNPs (no indels)
+        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+        vcf_df['start'] = vcf_df['position'].astype(int)
+        vcf_df['end'] = vcf_df['start'] + 1
+        vcf_df['strand'] = "."
+
+        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
+        vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+        print("len(vcf_df) = " + str(len(vcf_df)))
+
+        #Store intermediate SNPs
+        #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)
+        
+        #Load polyadenylation site annotation
+        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+        apa_df['Start'] += 1
+        
+        #Load gene span annotation
+        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+        
+        #Join dataframes against gtf annotation
+        apa_pr = pr.PyRanges(apa_df)
+        gtf_pr = pr.PyRanges(gtf_df)
+        vcf_pr = pr.PyRanges(vcf_df)
+
+        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+        
+        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)
+
+        apa_gtf_df['Start'] -= max_distance
+        apa_gtf_df['End'] += max_distance
+        
+        #Join vcf against polyadenylation annotation
+        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+        
+        #Force gene_id of SNP to be same as the gene_id of the polyA site        
+        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]
+
+        #PolyA site position
+        vcf_apa_df['Start_b'] += max_distance
+        vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+        #Distance to polyA site
+        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+        
+        #Choose unique SNPs by shortest distance to polyA site
+        vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]
+
+        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+        
+        #Store intermediate SNPs (filtered)
+        vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_polya_filtered.bed.gz", sep='\t', index=False)
+        
+        #Reload filtered SNP file
+        vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_polya_filtered.bed.gz", sep='\t', compression='gzip')
+
+        #Create variant identifier
+        vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']
+        
+        #Load merged fine-mapping dataframe
+        finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]
+        
+        #Join against fine-mapping dataframe
+        neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
+        neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.
+        
+        #Only keep SNPs with PIP < cutoff
+        neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+        #Store final table of negative SNPs
+        neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_polya_negatives.bed.gz", sep='\t', index=False)
+        
+        print("len(neg_df) = " + str(len(neg_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/paqtl_make_positive_sets.py b/src/scripts/data/qtl_data/paqtl_make_positive_sets.py
new file mode 100644
index 0000000..3d07fa3
--- /dev/null
+++ b/src/scripts/data/qtl_data/paqtl_make_positive_sets.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_positive_sets.py
+
+Build tables with positive (causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+    
+    #Parameters
+    pip_cutoff = 0.01
+    max_distance = 10000
+    gene_pad = 50
+    apa_file = '/home/drk/common/data/genomes/hg38/genes/polyadb/polyadb_exon3.bed'
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Compile positive SNP set for each tissue
+    for tissue_name in tissue_names :
+        
+        print("-- " + str(tissue_name) + " --")
+
+        #Load fine-mapping table
+        vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False)
+
+        #Only keep SNPs (no indels)
+        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+        
+        #Only keep SNPs associated with polyadenylation events
+        vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".downstream.")].copy().reset_index(drop=True)
+
+        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+        vcf_df['start'] = vcf_df['position'].astype(int)
+        vcf_df['end'] = vcf_df['start'] + 1
+        vcf_df['strand'] = "."
+
+        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']]
+        vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+        print("len(vcf_df) = " + str(len(vcf_df)))
+
+        #Load polyadenylation site annotation
+        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+        apa_df['Start'] += 1
+        
+        #Load gene span annotation
+        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+        
+        #Join dataframes against gtf annotation
+        apa_pr = pr.PyRanges(apa_df)
+        gtf_pr = pr.PyRanges(gtf_df)
+        vcf_pr = pr.PyRanges(vcf_df)
+
+        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True)
+
+        apa_gtf_df['Start'] -= max_distance
+        apa_gtf_df['End'] += max_distance
+        
+        #Join vcf against polyadenylation annotation
+        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+        
+        #Force gene_id of SNP to be same as the gene_id of the polyA site        
+        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']]
+        
+        #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait
+        vcf_apa_df['molecular_trait_gene_id'] = vcf_apa_df['molecular_trait_id'].apply(lambda x: x.split(".")[0])
+        vcf_apa_df = vcf_apa_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True)
+
+        #PolyA site position
+        vcf_apa_df['Start_b'] += max_distance
+        vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+        #Distance to polyA site
+        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+        
+        #Choose unique SNPs by shortest distance to polyA site (and inverse PIP for tie-breaking)
+        vcf_apa_df['pip_inv'] = 1. - vcf_apa_df['pip']
+        
+        vcf_apa_df = vcf_apa_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']]
+
+        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+        
+        #Store intermediate SNPs (filtered)
+        vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_polya_finemapped_filtered.bed.gz", sep='\t', index=False)
+        
+        #Reload filtered SNP file
+        vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_polya_finemapped_filtered.bed.gz", sep='\t', compression='gzip')
+
+        #Only keep SNPs with PIP > cutoff
+        pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+        #Store final table of positive SNPs
+        pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_polya_positives.bed.gz", sep='\t', index=False)
+        
+        print("len(pos_df) = " + str(len(pos_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/paqtl_vcfs.py b/src/scripts/data/qtl_data/paqtl_vcfs.py
new file mode 100644
index 0000000..f0884b1
--- /dev/null
+++ b/src/scripts/data/qtl_data/paqtl_vcfs.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+import os
+import pdb
+import time
+
+import numpy as np
+import pandas as pd
+import pyranges as pr
+from tqdm import tqdm
+
+'''
+paqtl_vcfs.py
+
+Generate positive and negative paQTL sets from the QTL catalog txrevise.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options]'
+    parser = OptionParser(usage)
+    parser.add_option('--neg_pip', dest='neg_pip',
+            default=0.01, type='float',
+            help='PIP upper limit for negative examples. [Default: %default]')
+    parser.add_option('--pos_pip', dest='pos_pip',
+            default=0.9, type='float',
+            help='PIP lower limit for positive examples. [Default: %default]')
+    parser.add_option('--match_gene', dest='match_gene',
+            default=0, type='int',
+            help='Try finding negative in same gene as positive. [Default: %default]')
+    parser.add_option('--match_allele', dest='match_allele',
+            default=0, type='int',
+            help='Try finding negative with same ref and alt alleles. [Default: %default]')
+    parser.add_option('-o', dest='out_prefix',
+            default='qtlcat_paqtl')
+    (options,args) = parser.parse_args()
+
+    tissue_name = options.out_prefix.split('txrev_')[1]
+    
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf'
+    
+    # read variant table
+    qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_polya_negatives.bed.gz", sep='\t')
+    qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_polya_positives.bed.gz", sep='\t')
+    
+    # read TPM bin table and construct lookup dictionaries
+    tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']]
+    gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index')
+    
+    # filter on SNPs with genes in TPM bin dict
+    qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+    qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+    
+    #Load gene span annotation (protein-coding/categorized only)
+    gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str'])
+    gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist()
+
+    # filter on SNPs with genes in GTF file
+    qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+    qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+    
+    bin_to_genes_dict = {}
+    for _, row in tpm_df.iterrows() :
+        
+        if row['bin_index'] not in bin_to_genes_dict :
+            bin_to_genes_dict[row['bin_index']] = []
+        
+        bin_to_genes_dict[row['bin_index']].append(row['gene_id'])
+    
+    for sample_bin in bin_to_genes_dict :
+        bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin])
+
+    # split molecular trait id and filter for polyadenylation (for positives)
+    qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id]
+    qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id]
+
+    qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'downstream']
+    qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'pas_dist'})
+    
+    qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.downstream.negative"
+    qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id']
+    qtlcat_df_neg['event'] = 'downstream'
+    qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'pas_dist'})
+
+    paqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True)
+
+    # determine positive variants
+    paqtl_pos_df = paqtl_df[paqtl_df.pip >= options.pos_pip]
+    paqtl_neg_df = paqtl_df[paqtl_df.pip < options.neg_pip]
+    pos_variants = set(paqtl_pos_df.variant)
+    
+    neg_gene_and_allele_variants = 0
+    neg_gene_variants = 0
+    
+    neg_expr_and_allele_variants = 0
+    neg_expr_variants = 0
+    
+    unmatched_variants = 0
+
+    # choose negative variants
+    neg_variants = set()
+    neg_dict = {}
+    for pvariant in tqdm(pos_variants):
+        paqtl_this_df = paqtl_pos_df[paqtl_pos_df.variant == pvariant]
+
+        neg_found = False
+        
+        # optionally prefer negative from positive's gene set
+        if options.match_gene == 1 and options.match_allele == 1 :
+            pgenes = set(paqtl_this_df.gene)
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if neg_found :
+                neg_gene_and_allele_variants += 1
+        
+        if not neg_found and options.match_gene == 1 :
+            pgenes = set(paqtl_this_df.gene)
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if neg_found :
+                neg_gene_variants += 1
+        
+        if not neg_found and options.match_allele == 1 :
+            pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+            
+            if neg_found :
+                neg_expr_and_allele_variants += 1
+        
+        if not neg_found :
+            pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+            
+            if neg_found :
+                neg_expr_variants += 1
+        
+        if not neg_found :
+            print("[Warning] Could not find a matching negative for '" + pvariant + "'")
+            unmatched_variants += 1
+
+    print('%d positive variants' % len(pos_variants))
+    print('%d negative variants' % len(neg_variants))
+    print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants)
+    print(' - %d gene-matched negatives ' % neg_gene_variants)
+    print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants)
+    print(' - %d expr-matched negatives ' % neg_expr_variants)
+    print(' - %d unmatched negatives ' % unmatched_variants)
+
+    pos_dict = {pv: pv for pv in pos_variants}
+    
+    # write VCFs
+    write_vcf('%s_pos.vcf' % options.out_prefix, paqtl_df, pos_variants, pos_dict)
+    write_vcf('%s_neg.vcf' % options.out_prefix, paqtl_df, neg_variants, neg_dict)
+
+def find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, match_allele) :
+    
+    gene_mask = np.array([gene in pgenes for gene in paqtl_neg_df.gene])
+    paqtl_neg_gene_df = paqtl_neg_df[gene_mask]
+
+    # match PAS distance
+    this_dist = paqtl_this_df.iloc[0].pas_dist
+    dist_cmp = np.abs(paqtl_neg_gene_df.pas_dist - this_dist)
+    dist_cmp_unique = np.sort(np.unique(dist_cmp.values))
+    
+    this_ref = paqtl_this_df.iloc[0].ref
+    this_alt = paqtl_this_df.iloc[0].alt
+
+    for ni_unique in dist_cmp_unique:
+        
+        paqtl_neg_gene_dist_df = paqtl_neg_gene_df.loc[dist_cmp == ni_unique]
+        
+        shuffle_index = np.arange(len(paqtl_neg_gene_dist_df), dtype='int32')
+        np.random.shuffle(shuffle_index)
+        
+        for npaqtl_i in range(len(paqtl_neg_gene_dist_df)) :
+            npaqtl = paqtl_neg_gene_dist_df.iloc[shuffle_index[npaqtl_i]]
+        
+            if not match_allele or (npaqtl.ref == this_ref and npaqtl.alt == this_alt):
+                if npaqtl.variant not in neg_variants and npaqtl.variant not in pos_variants:
+
+                    neg_variants.add(npaqtl.variant)
+                    neg_dict[npaqtl.variant] = paqtl_this_df.iloc[0].variant
+
+                    return True
+    
+    return False
+
+def write_vcf(vcf_file, df, variants_write, variants_dict):
+    vcf_open = open(vcf_file, 'w')
+    print('##fileformat=VCFv4.2', file=vcf_open)
+    print('##INFO=<ID=MT,Number=1,Type=String,Description="Molecular trait id">',
+        file=vcf_open)
+    print('##INFO=<ID=PD,Number=1,Type=Integer,Description="PAS distance">',
+        file=vcf_open)
+    print('##INFO=<ID=PI,Number=1,Type=String,Description="Positive SNP id">',
+        file=vcf_open)
+    cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+    print('\t'.join(cols), file=vcf_open)
+
+    variants_written = set()
+
+    for v in df.itertuples():
+        if v.variant in variants_write and v.variant not in variants_written:
+            cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.']
+            cols += ['MT=%s;PD=%d;PI=%s' % (v.molecular_trait_id, v.pas_dist, variants_dict[v.variant])]
+            print('\t'.join(cols), file=vcf_open)
+            variants_written.add(v.variant)
+
+    vcf_open.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/sqtl_make_negative_sets.py b/src/scripts/data/qtl_data/sqtl_make_negative_sets.py
new file mode 100644
index 0000000..7518ca4
--- /dev/null
+++ b/src/scripts/data/qtl_data/sqtl_make_negative_sets.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+sqtl_make_negative_sets.py
+
+Build tables with negative (non-causal) SNPs for sQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+    
+    #Parameters
+    pip_cutoff = 0.01
+    max_distance = 10000
+    gene_pad = 50
+    splice_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_protein_splice.gff'
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+    finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Compile negative SNP set for each tissue
+    for tissue_name in tissue_names :
+        
+        print("-- " + str(tissue_name) + " --")
+
+        #Load summary stats and extract unique set of SNPs
+        vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+
+        #Only keep SNPs (no indels)
+        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+        vcf_df['start'] = vcf_df['position'].astype(int)
+        vcf_df['end'] = vcf_df['start'] + 1
+        vcf_df['strand'] = "."
+
+        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
+        vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+        print("len(vcf_df) = " + str(len(vcf_df)))
+
+        #Store intermediate SNPs
+        #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)
+        
+        #Load splice site annotation
+        splice_df = pd.read_csv(splice_file, sep='\t', names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'], usecols=['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand'])[['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand']]
+        
+        #Load gene span annotation
+        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+        
+        #Join dataframes against gtf annotation
+        splice_pr = pr.PyRanges(splice_df)
+        gtf_pr = pr.PyRanges(gtf_df)
+        vcf_pr = pr.PyRanges(vcf_df)
+
+        splice_gtf_pr = splice_pr.join(gtf_pr, strandedness='same')
+        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+        splice_gtf_df = splice_gtf_pr.df[['Chromosome', 'Start', 'End', 'feature', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)
+
+        splice_gtf_df['Start'] -= max_distance
+        splice_gtf_df['End'] += max_distance
+        
+        #Join vcf against splice annotation
+        splice_gtf_pr = pr.PyRanges(splice_gtf_df)
+        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+        vcf_splice_pr = vcf_gtf_pr.join(splice_gtf_pr, strandedness=False)
+        
+        #Force gene_id of SNP to be same as the gene_id of the splice site        
+        vcf_splice_df = vcf_splice_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+        vcf_splice_df = vcf_splice_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'feature', 'Strand_b', 'Start_b']]
+
+        #Splice site position
+        vcf_splice_df['Start_b'] += max_distance
+        vcf_splice_df = vcf_splice_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'splice_pos', 'Strand_b' : 'Strand'})
+
+        #Distance to splice site
+        vcf_splice_df['distance'] = np.abs(vcf_splice_df['Pos'] - vcf_splice_df['splice_pos'])
+        
+        #Choose unique SNPs by shortest distance to splice site
+        vcf_splice_df = vcf_splice_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+        vcf_splice_df = vcf_splice_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+        vcf_df_filtered = vcf_splice_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'feature', 'strand', 'splice_pos', 'distance']]
+
+        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+        
+        #Store intermediate SNPs (filtered)
+        vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_splice_filtered.bed.gz", sep='\t', index=False)
+        
+        #Reload filtered SNP file
+        vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_splice_filtered.bed.gz", sep='\t', compression='gzip')
+
+        #Create variant identifier
+        vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']
+        
+        #Load merged fine-mapping dataframe
+        finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]
+        
+        #Join against fine-mapping dataframe
+        neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
+        neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.
+        
+        #Only keep SNPs with PIP < cutoff
+        neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+        #Store final table of negative SNPs
+        neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_splice_negatives.bed.gz", sep='\t', index=False)
+        
+        print("len(neg_df) = " + str(len(neg_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/sqtl_make_positive_sets.py b/src/scripts/data/qtl_data/sqtl_make_positive_sets.py
new file mode 100644
index 0000000..954ab7e
--- /dev/null
+++ b/src/scripts/data/qtl_data/sqtl_make_positive_sets.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+sqtl_make_positive_sets.py
+
+Build tables with positive (causal) SNPs for sQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options] arg'
+    parser = OptionParser(usage)
+    #parser.add_option()
+    (options,args) = parser.parse_args()
+    
+    #Parameters
+    pip_cutoff = 0.01
+    max_distance = 10000
+    gene_pad = 50
+    splice_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_protein_splice.gff'
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+
+    #Define tissues
+    tissue_names = [
+        'adipose_subcutaneous',
+        'adipose_visceral',
+        'adrenal_gland',
+        'artery_aorta',
+        'artery_coronary',
+        'artery_tibial',
+        'blood',
+        'brain_amygdala',
+        'brain_anterior_cingulate_cortex',
+        'brain_caudate',
+        'brain_cerebellar_hemisphere',
+        'brain_cerebellum',
+        'brain_cortex',
+        'brain_frontal_cortex',
+        'brain_hippocampus',
+        'brain_hypothalamus',
+        'brain_nucleus_accumbens',
+        'brain_putamen',
+        'brain_spinal_cord',
+        'brain_substantia_nigra',
+        'breast',
+        'colon_sigmoid',
+        'colon_transverse',
+        'esophagus_gej',
+        'esophagus_mucosa',
+        'esophagus_muscularis',
+        'fibroblast',
+        'heart_atrial_appendage',
+        'heart_left_ventricle',
+        'kidney_cortex',
+        'LCL',
+        'liver',
+        'lung',
+        'minor_salivary_gland',
+        'muscle',
+        'nerve_tibial',
+        'ovary',
+        'pancreas',
+        'pituitary',
+        'prostate',
+        'skin_not_sun_exposed',
+        'skin_sun_exposed',
+        'small_intestine',
+        'spleen',
+        'stomach',
+        'testis',
+        'thyroid',
+        'uterus',
+        'vagina',
+    ]
+    
+    #Compile positive SNP set for each tissue
+    for tissue_name in tissue_names :
+        
+        print("-- " + str(tissue_name) + " --")
+
+        #Load fine-mapping table
+        vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False)
+
+        #Only keep SNPs (no indels)
+        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+        
+        #Only keep SNPs associated with splice events
+        vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".contained.")].copy().reset_index(drop=True)
+
+        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+        vcf_df['start'] = vcf_df['position'].astype(int)
+        vcf_df['end'] = vcf_df['start'] + 1
+        vcf_df['strand'] = "."
+
+        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']]
+        vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+        print("len(vcf_df) = " + str(len(vcf_df)))
+        
+        #Load splice site annotation
+        splice_df = pd.read_csv(splice_file, sep='\t', names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'], usecols=['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand'])[['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand']]
+        
+        #Load gene span annotation
+        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+        
+        #Join dataframes against gtf annotation
+        splice_pr = pr.PyRanges(splice_df)
+        gtf_pr = pr.PyRanges(gtf_df)
+        vcf_pr = pr.PyRanges(vcf_df)
+
+        splice_gtf_pr = splice_pr.join(gtf_pr, strandedness='same')
+        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+        splice_gtf_df = splice_gtf_pr.df[['Chromosome', 'Start', 'End', 'feature', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True)
+
+        splice_gtf_df['Start'] -= max_distance
+        splice_gtf_df['End'] += max_distance
+        
+        #Join vcf against splice annotation
+        splice_gtf_pr = pr.PyRanges(splice_gtf_df)
+        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+        vcf_splice_pr = vcf_gtf_pr.join(splice_gtf_pr, strandedness=False)
+        
+        #Force gene_id of SNP to be same as the gene_id of the splice site        
+        vcf_splice_df = vcf_splice_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+        vcf_splice_df = vcf_splice_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'feature', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']]
+        
+        #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait
+        vcf_splice_df['molecular_trait_gene_id'] = vcf_splice_df['molecular_trait_id'].apply(lambda x: x.split(".")[0])
+        vcf_splice_df = vcf_splice_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True)
+
+        #Splice site position
+        vcf_splice_df['Start_b'] += max_distance
+        vcf_splice_df = vcf_splice_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'splice_pos', 'Strand_b' : 'Strand'})
+
+        #Distance to splice site
+        vcf_splice_df['distance'] = np.abs(vcf_splice_df['Pos'] - vcf_splice_df['splice_pos'])
+        
+        #Choose unique SNPs by shortest distance to splice site (and inverse PIP for tie-breaking)
+        vcf_splice_df['pip_inv'] = 1. - vcf_splice_df['pip']
+        
+        vcf_splice_df = vcf_splice_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+        vcf_splice_df = vcf_splice_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+        vcf_df_filtered = vcf_splice_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'feature', 'strand', 'splice_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']]
+
+        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+        
+        #Store intermediate SNPs (filtered)
+        vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_splice_finemapped_filtered.bed.gz", sep='\t', index=False)
+        
+        #Reload filtered SNP file
+        vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_splice_finemapped_filtered.bed.gz", sep='\t', compression='gzip')
+        
+        #Only keep SNPs with PIP > cutoff
+        pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+        #Store final table of positive SNPs
+        pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_splice_positives.bed.gz", sep='\t', index=False)
+        
+        print("len(pos_df) = " + str(len(pos_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/qtl_data/sqtl_vcfs.py b/src/scripts/data/qtl_data/sqtl_vcfs.py
new file mode 100644
index 0000000..d275a76
--- /dev/null
+++ b/src/scripts/data/qtl_data/sqtl_vcfs.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+import os
+import pdb
+import time
+
+import numpy as np
+import pandas as pd
+import pyranges as pr
+from tqdm import tqdm
+
+'''
+sqtl_vcfs.py
+
+Generate positive and negative sQTL sets from the QTL catalog txrevise.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = 'usage: %prog [options]'
+    parser = OptionParser(usage)
+    parser.add_option('--neg_pip', dest='neg_pip',
+            default=0.01, type='float',
+            help='PIP upper limit for negative examples. [Default: %default]')
+    parser.add_option('--pos_pip', dest='pos_pip',
+            default=0.9, type='float',
+            help='PIP lower limit for positive examples. [Default: %default]')
+    parser.add_option('--match_gene', dest='match_gene',
+            default=0, type='int',
+            help='Try finding negative in same gene as positive. [Default: %default]')
+    parser.add_option('--match_allele', dest='match_allele',
+            default=0, type='int',
+            help='Try finding negative with same ref and alt alleles. [Default: %default]')
+    parser.add_option('-o', dest='out_prefix',
+            default='qtlcat_sqtl')
+    (options,args) = parser.parse_args()
+
+    tissue_name = options.out_prefix.split('txrev_')[1]
+    
+    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf'
+    
+    # read variant table
+    qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_splice_negatives.bed.gz", sep='\t')
+    qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_splice_positives.bed.gz", sep='\t')
+    
+    # read TPM bin table and construct lookup dictionaries
+    tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']]
+    gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index')
+    
+    # filter on SNPs with genes in TPM bin dict
+    qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+    qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+    
+    #Load gene span annotation (protein-coding/categorized only)
+    gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str'])
+    gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist()
+
+    # filter on SNPs with genes in GTF file
+    qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+    qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+    
+    bin_to_genes_dict = {}
+    for _, row in tpm_df.iterrows() :
+        
+        if row['bin_index'] not in bin_to_genes_dict :
+            bin_to_genes_dict[row['bin_index']] = []
+        
+        bin_to_genes_dict[row['bin_index']].append(row['gene_id'])
+    
+    for sample_bin in bin_to_genes_dict :
+        bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin])
+
+    # split molecular trait id and filter for polyadenylation (for positives)
+    qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id]
+    qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id]
+
+    qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'contained']
+    qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'splice_dist'})
+    
+    qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.contained.negative"
+    qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id']
+    qtlcat_df_neg['event'] = 'contained'
+    qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'splice_dist'})
+
+    sqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True)
+
+    # determine positive variants
+    sqtl_pos_df = sqtl_df[sqtl_df.pip >= options.pos_pip]
+    sqtl_neg_df = sqtl_df[sqtl_df.pip < options.neg_pip]
+    pos_variants = set(sqtl_pos_df.variant)
+    
+    neg_gene_and_allele_variants = 0
+    neg_gene_variants = 0
+    
+    neg_expr_and_allele_variants = 0
+    neg_expr_variants = 0
+    
+    unmatched_variants = 0
+
+    # choose negative variants
+    neg_variants = set()
+    neg_dict = {}
+    for pvariant in tqdm(pos_variants):
+        sqtl_this_df = sqtl_pos_df[sqtl_pos_df.variant == pvariant]
+
+        neg_found = False
+        
+        # optionally prefer negative from positive's gene set
+        if options.match_gene == 1 and options.match_allele == 1 :
+            pgenes = set(sqtl_this_df.gene)
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+            
+            if neg_found :
+                neg_gene_and_allele_variants += 1
+        
+        if not neg_found and options.match_gene == 1 :
+            pgenes = set(sqtl_this_df.gene)
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+            
+            if neg_found :
+                neg_gene_variants += 1
+        
+        if not neg_found and options.match_allele == 1 :
+            pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index']]
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+            
+            if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+            
+            if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+            
+            if neg_found :
+                neg_expr_and_allele_variants += 1
+        
+        if not neg_found :
+            pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index']]
+            neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+            
+            if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+            
+            if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r'] :
+                pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r']]
+                neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+            
+            if neg_found :
+                neg_expr_variants += 1
+        
+        if not neg_found :
+            print("[Warning] Could not find a matching negative for '" + pvariant + "'")
+            unmatched_variants += 1
+
+    print('%d positive variants' % len(pos_variants))
+    print('%d negative variants' % len(neg_variants))
+    print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants)
+    print(' - %d gene-matched negatives ' % neg_gene_variants)
+    print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants)
+    print(' - %d expr-matched negatives ' % neg_expr_variants)
+    print(' - %d unmatched negatives ' % unmatched_variants)
+
+    pos_dict = {pv: pv for pv in pos_variants}
+    
+    # write VCFs
+    write_vcf('%s_pos.vcf' % options.out_prefix, sqtl_df, pos_variants, pos_dict)
+    write_vcf('%s_neg.vcf' % options.out_prefix, sqtl_df, neg_variants, neg_dict)
+
+def find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, match_allele) :
+    
+    gene_mask = np.array([gene in pgenes for gene in sqtl_neg_df.gene])
+    sqtl_neg_gene_df = sqtl_neg_df[gene_mask]
+
+    # match PAS distance
+    this_dist = sqtl_this_df.iloc[0].splice_dist
+    dist_cmp = np.abs(sqtl_neg_gene_df.splice_dist - this_dist)
+    dist_cmp_unique = np.sort(np.unique(dist_cmp.values))
+    
+    this_ref = sqtl_this_df.iloc[0].ref
+    this_alt = sqtl_this_df.iloc[0].alt
+
+    for ni_unique in dist_cmp_unique:
+        
+        sqtl_neg_gene_dist_df = sqtl_neg_gene_df.loc[dist_cmp == ni_unique]
+        
+        shuffle_index = np.arange(len(sqtl_neg_gene_dist_df), dtype='int32')
+        np.random.shuffle(shuffle_index)
+        
+        for nsqtl_i in range(len(sqtl_neg_gene_dist_df)) :
+            nsqtl = sqtl_neg_gene_dist_df.iloc[shuffle_index[nsqtl_i]]
+        
+            if not match_allele or (nsqtl.ref == this_ref and nsqtl.alt == this_alt):
+                if nsqtl.variant not in neg_variants and nsqtl.variant not in pos_variants:
+
+                    neg_variants.add(nsqtl.variant)
+                    neg_dict[nsqtl.variant] = sqtl_this_df.iloc[0].variant
+
+                    return True
+    
+    return False
+
+def write_vcf(vcf_file, df, variants_write, variants_dict):
+    vcf_open = open(vcf_file, 'w')
+    print('##fileformat=VCFv4.2', file=vcf_open)
+    print('##INFO=<ID=MT,Number=1,Type=String,Description="Molecular trait id">',
+        file=vcf_open)
+    print('##INFO=<ID=SD,Number=1,Type=Integer,Description="Splice distance">',
+        file=vcf_open)
+    print('##INFO=<ID=PI,Number=1,Type=String,Description="Positive SNP id">',
+        file=vcf_open)
+    cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+    print('\t'.join(cols), file=vcf_open)
+
+    variants_written = set()
+
+    for v in df.itertuples():
+        if v.variant in variants_write and v.variant not in variants_written:
+            cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.']
+            cols += ['MT=%s;SD=%d;PI=%s' % (v.molecular_trait_id, v.splice_dist, variants_dict[v.variant])]
+            print('\t'.join(cols), file=vcf_open)
+            variants_written.add(v.variant)
+
+    vcf_open.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+    main()
diff --git a/src/scripts/data/training_data/Makefile b/src/scripts/data/training_data/Makefile
new file mode 100644
index 0000000..170222b
--- /dev/null
+++ b/src/scripts/data/training_data/Makefile
@@ -0,0 +1,47 @@
+FASTA_HUMAN=$$HG38/assembly/ucsc/hg38.ml.fa
+GAPS_HUMAN=$$HG38/assembly/ucsc/hg38_gaps.bed
+UMAP_HUMAN=$$HG38//mappability/umap_k36_t10_l32.bed
+BLACK_HUMAN=$$HG38/blacklist/blacklist_hg38_all.bed
+
+FASTA_MOUSE=$$MM10/assembly/ucsc/mm10.ml.fa
+GAPS_MOUSE=$$MM10/assembly/ucsc/mm10_gaps.bed
+UMAP_MOUSE=$$MM10//mappability/umap_k36_t10_l32.bed
+BLACK_MOUSE=$$MM10/blacklist/blacklist_mm10_all.bed
+
+ALIGN=$$HG38/align/hg38.mm10.syn.net.gz
+
+OUT=/scratch3/drk/seqnn/data/v9
+
+# LENGTH=393216
+# TSTRIDE=43691 # (393216-2*131072)/3
+# CROP=131072
+
+LENGTH=524288
+TSTRIDE=49173 # (524288-2*163840)/4 + 21
+CROP=163840
+WIDTH=32
+FOLDS=8
+
+AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH)
+DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH)
+
+
+all: $(OUT)/hg38/tfrecords/train-0.tfr $(OUT)/mm10/tfrecords/train-0.tfr
+
+umap_human.bed:
+	cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed
+
+umap_mouse.bed:
+	cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed
+
+targets_human.txt targets_mouse.txt:
+	./make_targets.py
+
+$(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed
+	basenji_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE)
+
+$(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt
+	basenji_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt
+
+$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt
+	basenji_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt
diff --git a/src/scripts/data/training_data/README.md b/src/scripts/data/training_data/README.md
new file mode 100644
index 0000000..de8af67
--- /dev/null
+++ b/src/scripts/data/training_data/README.md
@@ -0,0 +1,11 @@
+## Data processing & Training
+
+Processing of the ENCODE, GTEx, FANTOM5, and CATlas training data is done through the Makefile in this folder. It requires a number of auxiliary files (e.g. genome alignments and lists of unmappable regions), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).<br/>
+<br/>
+The Makefile relies on the script 'basenji_data.py' from the [basenji repository](https://github.com/calico/basenji/blob/master/bin/basenji_data.py), which in turn calls the scripts 'basenji_data_read.py' and 'basenji_data_write.py' from the same repo, in order to (1) read coverage data (from bigwig-like files) along with a matched segment from a fasta genome file, and (2) write the (one-hot coded) sequence along with coverage values into compressed TF records.<br/>
+<br/>
+*Notes*: 
+- The attached Makefile shows the exact commands used to call basenji_data.py and other related scripts to create the specific training data for the published model.
+- The script(s) take as input a fasta genome file, optional blacklist+unmappable region files, as well as a .txt file where each row points to a bigwig coverage file location (see for [this file](https://raw.githubusercontent.com/calico/borzoi/main/examples/targets_human.txt)).<br/>
+<br/>
+The model training is done through the script 'hound_train.py' from the [baskerville repository](https://github.com/calico/baskerville/blob/main/src/baskerville/scripts/hound_train.py). Most of the training parameters are set through a .json file that is supplied to the script. The published model's parameter file can be found [here](https://storage.googleapis.com/seqnn-share/borzoi/params.json).<br/>

From 6882605d6f83bc19bf9c7d52852808354308184b Mon Sep 17 00:00:00 2001
From: johli <jlinder@calicolabs.com>
Date: Fri, 19 Apr 2024 11:04:24 -0700
Subject: [PATCH 2/3] Update README.md

---
 src/scripts/data/training_data/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/scripts/data/training_data/README.md b/src/scripts/data/training_data/README.md
index de8af67..7c2751e 100644
--- a/src/scripts/data/training_data/README.md
+++ b/src/scripts/data/training_data/README.md
@@ -1,11 +1,11 @@
 ## Data processing & Training
 
-Processing of the ENCODE, GTEx, FANTOM5, and CATlas training data is done through the Makefile in this folder. It requires a number of auxiliary files (e.g. genome alignments and lists of unmappable regions), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).<br/>
-<br/>
+Processing of ENCODE, GTEx, FANTOM5, and CATlas training data is done through a Makefile. It requires a number of auxiliary files (e.g. genome alignments), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).<br/>
+
 The Makefile relies on the script 'basenji_data.py' from the [basenji repository](https://github.com/calico/basenji/blob/master/bin/basenji_data.py), which in turn calls the scripts 'basenji_data_read.py' and 'basenji_data_write.py' from the same repo, in order to (1) read coverage data (from bigwig-like files) along with a matched segment from a fasta genome file, and (2) write the (one-hot coded) sequence along with coverage values into compressed TF records.<br/>
-<br/>
+
 *Notes*: 
 - The attached Makefile shows the exact commands used to call basenji_data.py and other related scripts to create the specific training data for the published model.
 - The script(s) take as input a fasta genome file, optional blacklist+unmappable region files, as well as a .txt file where each row points to a bigwig coverage file location (see for [this file](https://raw.githubusercontent.com/calico/borzoi/main/examples/targets_human.txt)).<br/>
-<br/>
+
 The model training is done through the script 'hound_train.py' from the [baskerville repository](https://github.com/calico/baskerville/blob/main/src/baskerville/scripts/hound_train.py). Most of the training parameters are set through a .json file that is supplied to the script. The published model's parameter file can be found [here](https://storage.googleapis.com/seqnn-share/borzoi/params.json).<br/>

From 2eebb71bfac223768297dbf0754e7516203194cd Mon Sep 17 00:00:00 2001
From: johli <jlinder@calicolabs.com>
Date: Fri, 19 Apr 2024 11:06:22 -0700
Subject: [PATCH 3/3] Update README.md

---
 src/scripts/data/qtl_data/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/scripts/data/qtl_data/README.md b/src/scripts/data/qtl_data/README.md
index 3a3589e..8831abf 100644
--- a/src/scripts/data/qtl_data/README.md
+++ b/src/scripts/data/qtl_data/README.md
@@ -1,9 +1,9 @@
 ## QTL data processing
 
-The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.<br/>
-<br/>
+The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.<br/>
+
 *Notes*: 
-- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).<br/>
+- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
 <br/>
 
 As a prerequisite to generating any of the QTL datasets, run the following scripts (in order):