[WIP] Str counts #37

Open · wants to merge 6 commits into master
12 changes: 6 additions & 6 deletions .testing/STRs.benchmark.tsv
@@ -1,6 +1,6 @@
chrom start end sample repeatunit reflen locuscoverage outlier p_adj bpInsertion repeatUnits
chr13 70713515 70713561 11 AGC 15.3 35 1.88685648944145 0.0295898153640992 316.950118548597 120.950039516199
chr13 70713515 70713561 69 AGC 15.3 8 0.4297790752454 0.333678177749521 76.1210174457791 40.6736724819264
chr13 70713515 70713561 1 AGC 15.3 3 -0.426744171073212 0.665217162914456 32.9115187238725 26.2705062412908
chr13 70713515 70713561 54 AGC 15.3 2 -0.730726313557163 0.7675268301731 24.4403744973217 23.4467914991072
chr13 70713515 70713561 49 AGC 15.3 1 -1.15916508005647 0.876805548823274 16.0677184130193 20.6559061376731
chrom start end sample repeatunit reflen locuscoverage total_assigned outlier p_adj bpInsertion repeatUnits repeatUnits_max
chr13 70713515 70713561 11_L001_R1 AGC 15.3 35 52.0 1.9 0.03 317.0 121.0 172.9
chr13 70713515 70713561 69_L001_R1 AGC 15.3 8 8.0 0.4 0.33 76.1 40.7 40.7
chr13 70713515 70713561 1_L001_R1 AGC 15.3 3 3.0 -0.4 0.67 32.9 26.3 26.3
chr13 70713515 70713561 54_L001_R1 AGC 15.3 2 2.0 -0.7 0.77 24.4 23.4 23.4
chr13 70713515 70713561 49_L001_R1 AGC 15.3 1 1.0 -1.2 0.88 16.1 20.7 20.7
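A worked check of the new total_assigned column against the old values (a sketch: decoycov_pairs is inferred, not a column in this file, and assumes each sample has only the one AGC locus shown here, so the per-repeat-unit sum equals locuscoverage):

```python
# Sample 11, first new row (benchmark values; pair count inferred for illustration)
locuscoverage = 35      # reads assigned to the chr13 AGC locus
repeatunit_sum = 35     # single AGC locus for this sample, so sum == locuscoverage
total_assigned = 52.0   # reported in the new column
decoycov_pairs = (total_assigned - locuscoverage) * repeatunit_sum / locuscoverage
print(decoycov_pairs)   # 17.0 same-decoy read pairs
```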
1 change: 1 addition & 0 deletions .travis.yml
@@ -11,6 +11,7 @@ script:
# Run the test data
- ../tools/bin/bpipe run ../pipelines/STRetch_exome_pipeline.groovy ../test-data/*.fastq.gz
- if diff STRs.tsv ../.testing/STRs.benchmark.tsv; then echo exit 0; else echo exit 1; fi
#- diff STRs.tsv ../.testing/STRs.benchmark.tsv
after_script:
- head *.locus_counts *.STR_counts *.median_cov
- head *.tsv
1 change: 0 additions & 1 deletion pipelines/STRetch_exome_pipeline.groovy
@@ -15,7 +15,6 @@ run {
set_sample_info +
align_bwa + index_bam +
median_cov_region +
STR_coverage +
STR_locus_counts
] +
estimate_size
5 changes: 2 additions & 3 deletions pipelines/STRetch_wgs_bam_pipeline.groovy
@@ -32,10 +32,9 @@ run {
mosdepth_dist + mosdepth_median,
shards * [
init_shard + align_bwa_bam + index_bam
] + merge_bams
] +
STR_coverage +
] + merge_bams +
STR_locus_counts
]
] +
estimate_size
}
1 change: 0 additions & 1 deletion pipelines/STRetch_wgs_pipeline.groovy
@@ -12,7 +12,6 @@ run {
set_sample_info +
align_bwa + index_bam +
median_cov +
STR_coverage +
STR_locus_counts
] +
estimate_size
24 changes: 9 additions & 15 deletions pipelines/pipeline_stages.groovy
@@ -55,9 +55,13 @@ set_sample_info = {
// sample_flowcell_library_lane_read.fastq.gz
// however sample can also include underscores
def info = get_info(input)
if (info.length >= 5) {
if (info.length == 2) {

//branch.sample = info[0..-5].join("_")
branch.sample = info[0..-2].join("_")
}
if (info.length >= 5) {

branch.sample = info[0..-5].join("_")
branch.flowcell = info[-4]
branch.library = info[-3]
branch.lane = info[-2]
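For readers unfamiliar with the naming convention this stage parses (sample_flowcell_library_lane_read.fastq.gz, where the sample itself may contain underscores), a minimal Python sketch of the equivalent logic; get_info and the filename are hypothetical stand-ins for the Groovy helpers:

```python
import os

def get_info(path):
    # Hypothetical analogue of the pipeline's get_info: split the basename
    # into its underscore-delimited fields
    base = os.path.basename(path).replace('.fastq.gz', '')
    return base.split('_')

info = get_info('NA12878_rep2_H7AGFADXX_LIB01_L001_R1.fastq.gz')  # made-up name
if len(info) == 2:
    sample = '_'.join(info[:-1])    # drop the read field only
elif len(info) >= 5:
    sample = '_'.join(info[:-4])    # drop flowcell, library, lane, read
    flowcell, library, lane = info[-4], info[-3], info[-2]
```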
@@ -165,27 +169,17 @@ index_bam = {
forward input
}

STR_coverage = {
transform("bam") to ("STR_counts") {
exec """
$bedtools coverage -counts
-sorted
-g ${REF}.genome
-a $DECOY_BED
-b $input.bam > $output.STR_counts
"""
}
}

STR_locus_counts = {
transform("bam") to ("locus_counts") {

transform("bam") to ("locus_counts", "STR_counts") {
exec """
STRPATH=$PATH;
PATH=$STRETCH/tools/bin:$PATH;
$python $STRETCH/scripts/identify_locus.py
--bam $input.bam
--bed $STR_BED
--output $output.locus_counts
--STR_counts $output.STR_counts
;PATH=$STRPATH
"""
}
93 changes: 48 additions & 45 deletions scripts/estimateSTR.py
@@ -59,13 +59,12 @@ def parse_STRcov(filename):
"""Parse all STR coverage"""
sample_id = get_sample(filename)
try:
cov_data = pd.read_table(filename, delim_whitespace = True,
names = ['chrom', 'start', 'end', 'decoycov'])
cov_data = pd.read_table(filename, delim_whitespace = True)
except pd.io.common.EmptyDataError:
sys.exit('ERROR: file {0} was empty.\n'.format(filename))
cov_data['sample'] = sample_id
cov_data['repeatunit'] = [x.split('-')[1] for x in cov_data['chrom']]
cov_data = cov_data[['sample', 'repeatunit', 'decoycov']]
cov_data = cov_data[['sample', 'repeatunit', 'decoycov', 'decoycov_pairs']]
return(cov_data)

def parse_locuscov(filename):
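parse_STRcov now reads the column names from the file header (written by identify_locus.py, see below) instead of hard-coding them, and carries the new decoycov_pairs column through. A self-contained sketch with toy values:

```python
import io
import pandas as pd

# Toy STR_counts content in the new header-carrying format (values made up)
toy = io.StringIO("chrom\tdecoycov\tdecoycov_pairs\n"
                  "STR-AGC\t43\t17\n"
                  "STR-AAT\t7\t0\n")
cov_data = pd.read_table(toy, delim_whitespace=True)
cov_data['repeatunit'] = [x.split('-')[1] for x in cov_data['chrom']]
print(cov_data[['repeatunit', 'decoycov', 'decoycov_pairs']])
```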
@@ -224,36 +223,29 @@ def main():
factor = 100

STRcov_data = pd.merge(STRcov_data, genomecov_data)
#STRcov_data['decoycov_norm'] = factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov']
#STRcov_data['decoycov_log'] = np.log2(STRcov_data['decoycov_norm'])
#XXX combines the previous two lines into one. Keeping commented out in case coverage_norm is required later
STRcov_data['decoycov_log'] = np.log2(factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov'])

# #XXX ***Not fully implemented***
# # Look for repeat units where the number of reads mapping to the decoy can't be
# # explained by those mapping to all loci with that repeat unit
#
# # Sum the counts over all loci for each repeat unit in each sample
# locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].aggregate(np.sum)
# locus_totals = pd.DataFrame(locus_totals).reset_index() # convert to DataFrame and make indices into columns
# # Calculate the difference between reads assigned to a decoy and the sum of
# # all reads assigned to loci with that repeat unit
# all_differences = pd.merge(STRcov_data, locus_totals, how='left')
# all_differences['difference'] = all_differences['decoycov'] - all_differences['locuscoverage']
# # Normalise differences by median coverage and take the log2
# all_differences['difference_log'] = np.log2(factor * (all_differences['difference'] + 1) / all_differences['genomecov'])
#
# locus_totals = pd.merge(locus_totals, STRcov_data)
#
# # Assign decoy counts to each locus, based on what proportion of the counts for that repeat unit they already have

locus_totals = pd.merge(locuscov_data, STRcov_data, how = 'left')

locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove this line if implementing the above

# Use pairs of reads where both map to the same STR decoy chromosome to increase
# size estimates and calculate outliers
# Sum the counts over all loci for each repeat unit in each sample and count loci
# with same repeat unit
locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
'count']).reset_index()
locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
# Assign decoy counts to each locus, based on what proportion of the counts for that
# repeat unit they already have
locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
# NaNs introduced where count = 0, fill these with locuscoverage
locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)

#XXX Uncomment if not implementing the above
# locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
# locus_totals['total_assigned'] = locus_totals['locuscoverage']

locus_totals['locuscoverage_log'] = np.log2(factor * (locus_totals['locuscoverage'] + 1) / locus_totals['genomecov'])
locus_totals['total_assigned_log'] = np.log2(factor * (locus_totals['total_assigned'] + 1) / locus_totals['genomecov'])

# For each locus, calculate if that sample is an outlier relative to others
total_assigned_wide = locus_totals.pivot(index='locus', columns='sample', values='total_assigned_log')
locuscoverage_wide = locus_totals.pivot(index='locus', columns='sample', values='locuscoverage_log')

# Calculate values for if there were zero reads at a locus in all samples
null_locus_counts = np.log2(factor * (0 + 1) / genomecov_data['genomecov'])
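A minimal pandas sketch of the proportional assignment above, with toy numbers: same-decoy read pairs (decoycov_pairs) are shared out among a sample's loci for that repeat unit in proportion to the locus counts they already have.

```python
import pandas as pd

locuscov_data = pd.DataFrame({
    'sample': ['s1', 's1', 's1'],
    'repeatunit': ['AGC', 'AGC', 'AAT'],
    'locus': ['chr13-1-2', 'chr13-5-6', 'chr2-1-2'],
    'locuscoverage': [30, 10, 5],
})
STRcov_data = pd.DataFrame({
    'sample': ['s1', 's1'],
    'repeatunit': ['AGC', 'AAT'],
    'decoycov_pairs': [8, 0],
})

totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(
    ['sum', 'count']).reset_index()
locus_totals = locuscov_data.merge(totals, how='left').merge(STRcov_data, how='left')
locus_totals['total_assigned'] = (locus_totals['locuscoverage']
    + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage'] / locus_totals['sum'])
# chr13-1-2 receives 30 + 8 * 30/40 = 36; chr13-5-6 receives 10 + 8 * 10/40 = 12
locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
```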
@@ -268,7 +260,7 @@ def main():

# Use Huber's M-estimator to calculate median and SD across all samples
# for each locus
sample_estimates = total_assigned_wide.apply(hubers_est, axis=1)
sample_estimates = locuscoverage_wide.apply(hubers_est, axis=1)
# Where sd is NA, replace with the minimum non-zero sd from all loci
min_sd = np.min(sample_estimates['sd'][sample_estimates['sd'] > 0])
sample_estimates['sd'].fillna(min_sd, inplace=True)
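hubers_est is not shown in this diff; a plausible shape for it, using the Huber M-estimator from statsmodels (an assumption about the implementation, not the script's exact code):

```python
import numpy as np
import pandas as pd
from statsmodels.robust.scale import huber

def hubers_est(row):
    # Robust location (mu) and scale (sd) across samples at one locus;
    # fall back to a NaN scale when the estimator fails to converge
    vals = np.asarray(row.dropna(), dtype=float)
    try:
        mu, sd = huber(vals)
    except ValueError:
        mu, sd = np.median(vals), np.nan
    return pd.Series({'mu': mu, 'sd': sd})

wide = pd.DataFrame([[1.2, 1.0, 1.1, 3.5],
                     [0.9, 1.1, 1.0, 1.2]], index=['locusA', 'locusB'])
estimates = wide.apply(hubers_est, axis=1)
# The script then replaces NaN sd values with the minimum non-zero sd over loci
```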
@@ -284,7 +276,7 @@ def main():

sample_estimates.loc['null_locus_counts'] = null_locus_counts_est

n = len(total_assigned_wide.columns)
n = len(locuscoverage_wide.columns)
sample_estimates['n'] = n

sample_estimates.to_csv(emit_file, sep= '\t')
@@ -296,30 +288,30 @@ def main():
control_estimates = parse_controls(control_file)
# Get a list of all loci in the control file but not the sample data
control_loci_df = control_estimates.iloc[control_estimates.index != 'null_locus_counts']
control_loci = [x for x in control_loci_df.index if x not in total_assigned_wide.index]
control_loci = [x for x in control_loci_df.index if x not in locuscoverage_wide.index]

# Extract and order just those control estimates appearing in the current data
mu_sd_estimates = control_estimates.reindex(total_assigned_wide.index)
mu_sd_estimates = control_estimates.reindex(locuscoverage_wide.index)
# Fill NaNs with null_locus_counts values
mu_sd_estimates.fillna(control_estimates.loc['null_locus_counts'],
inplace=True)
else:
# Extract and order estimates to match the current data
mu_sd_estimates = sample_estimates.reindex(total_assigned_wide.index)
mu_sd_estimates = sample_estimates.reindex(locuscoverage_wide.index)

# calculate z scores
z = z_score(total_assigned_wide, mu_sd_estimates)
z = z_score(locuscoverage_wide, mu_sd_estimates)

# If a control file is given, effectively add zeros counts at all loci in
# controls but not in the samples.
# These extra rows will disappear due to a later merge
if control_file != '':
# Create a total_assigned_wide as if all loci have zero counts
null_total_assigned_wide = pd.DataFrame(columns = sample_names, index = control_loci)
null_total_assigned_wide.fillna(null_locus_counts, inplace = True)
# Create a locuscoverage_wide as if all loci have zero counts
null_locuscoverage_wide = pd.DataFrame(columns = sample_names, index = control_loci)
null_locuscoverage_wide.fillna(null_locus_counts, inplace = True)
# Calculate z scores
null_z = z_score(null_total_assigned_wide,
control_estimates.reindex(null_total_assigned_wide.index))
null_z = z_score(null_locuscoverage_wide,
control_estimates.reindex(null_locuscoverage_wide.index))
loci_with_counts = z.index
z = z.append(null_z)
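z_score is also defined elsewhere in the script; its assumed form, standardising each locus row of the wide matrix by that locus's robust estimates:

```python
# Reusing `wide` and `estimates` from the hubers_est sketch above
def z_score(wide, estimates):
    # (wide - mu) / sd, aligned on the locus index (assumed helper signature)
    return wide.sub(estimates['mu'], axis=0).div(estimates['sd'], axis=0)

z = z_score(wide, estimates)
```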

@@ -375,14 +367,19 @@ def main():
X_train = np.log2(STRcov_model['coverage_norm']).values.reshape(-1, 1)
Y_train = np.log2(STRcov_model['allele2'])
regr.fit(X_train, Y_train)
# Make a prediction
Y_pred = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
# Make a prediction based on locuscoverage
Y_pred = regr.predict(locus_totals['locuscoverage_log'].values.reshape(-1, 1))
predict = np.power(2, Y_pred)
locus_totals['bpInsertion'] = predict
# Make a prediction based on total_assigned
Y_pred_max = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
predict_max = np.power(2, Y_pred_max)
locus_totals['bpInsertion_max'] = predict_max

# Get the estimated size in terms of repeat units (total, not relative to ref)
repeatunit_lens = [len(x) for x in locus_totals['repeatunit']]
locus_totals['repeatUnits'] = (locus_totals['bpInsertion']/repeatunit_lens) + locus_totals['reflen']
locus_totals['repeatUnits_max'] = (locus_totals['bpInsertion_max']/repeatunit_lens) + locus_totals['reflen']

# Split locus into 3 columns: chrom start end
locuscols = pd.DataFrame([x.split('-') for x in locus_totals['locus']],
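The size model is a log-log linear regression from normalised decoy coverage to known expansion size; this PR applies it twice, once to locuscoverage_log (reported as bpInsertion) and once to total_assigned_log (reported as bpInsertion_max). A toy sketch of that double prediction (training values are made up; regr is a scikit-learn LinearRegression as in the script):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Made-up training data: log2 normalised coverage vs log2 allele length (bp)
X_train = np.log2(np.array([4.0, 8.0, 16.0, 32.0])).reshape(-1, 1)
Y_train = np.log2(np.array([100.0, 190.0, 410.0, 800.0]))
regr = LinearRegression().fit(X_train, Y_train)

# One locus, toy values: predict from locus reads alone, then including pair reads
locuscoverage_log, total_assigned_log = 4.2, 4.8
bpInsertion = 2 ** regr.predict([[locuscoverage_log]])[0]
bpInsertion_max = 2 ** regr.predict([[total_assigned_log]])[0]
print(bpInsertion, bpInsertion_max)  # _max is larger whenever pair reads were added
```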
@@ -392,14 +389,20 @@ def main():
# Specify output data columns
write_data = locus_totals[['chrom', 'start', 'end',
'sample', 'repeatunit', 'reflen',
'locuscoverage',
'locuscoverage', 'total_assigned',
'locuscoverage_log', 'total_assigned_log', 'genomecov',
'outlier', 'p_adj',
'bpInsertion', 'repeatUnits'
'bpInsertion', 'repeatUnits', 'repeatUnits_max'
]]

# Sort by outlier score then estimated size (bpInsertion), both descending
write_data = write_data.sort_values(['outlier', 'bpInsertion'], ascending=[False, False])
#XXX check for duplicate rows?
# Convert outlier and p_adj to numeric type and do some rounding/formatting
write_data['outlier'] = pd.to_numeric(write_data['outlier'])
write_data['p_adj'] = [ format(x, '.2g') for x in pd.to_numeric(write_data['p_adj']) ]
write_data = write_data.round({'total_assigned': 1, 'outlier': 1,
'bpInsertion': 1, 'repeatUnits': 1, 'repeatUnits_max': 1})

# Write individual files for each sample, remove rows where locuscoverage == 0
samples = set(write_data['sample'])
26 changes: 23 additions & 3 deletions scripts/identify_locus.py
@@ -33,7 +33,10 @@ def parse_args():
help='bed file containing genomic locations of STR loci. Genomic locations should be relative to the fasta reference used to create the bam')
parser.add_argument(
'--output', type=str, required=False,
help='Output file name. Defaults to stdout.')
help='Output file name for locus counts. Defaults to stdout.')
parser.add_argument(
'--STR_counts', type=str, required=False,
help='Output file name for total counts of reads aligned to each STR decoy chromosome. If not given, these will not be reported.')
parser.add_argument(
'--dist', type=int, required=False, default=500,
help='Counts are only assigned to an STR locus that is at most this many bp away. The best choice is probably the insert size. (default: %(default)s)')
@@ -95,6 +98,13 @@ def main():
# Read bam
bam = pysam.Samfile(bamfile, 'rb')

# Get count of reads aligned to STR decoy chromosomes
STR_counts = {}
index_stats = bam.get_index_statistics()
for x in index_stats:
if x.contig.startswith("STR-"):
STR_counts[x.contig] = [x.mapped, None] # cols: mapped, both on same decoy

# Get relevant chromosomes
required_chroms = []
unpaired = 0
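get_index_statistics() reads mapped-read counts per contig straight from the BAM index, which is why the separate bedtools-based STR_coverage stage can be dropped from the pipelines. A sketch mirroring the new code, with a hypothetical BAM path:

```python
import pysam

bam = pysam.Samfile('sample.bam', 'rb')  # hypothetical indexed BAM
STR_counts = {}
for stat in bam.get_index_statistics():
    if stat.contig.startswith('STR-'):
        # [reads mapped to the decoy, pairs with both mates on it (filled later)]
        STR_counts[stat.contig] = [stat.mapped, None]
```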
@@ -105,8 +115,8 @@ def main():
# Check if any STR- chromosomes
if len(required_chroms) == 0:
sys.exit('ERROR: There were no reads mapping to chromosomes with names starting with "STR-" in {0}. Are you sure this data is mapped to a reference genome with STR decoy chromosomes?'.format(bamfile))

for chrom in required_chroms:
STR_counts[chrom][1] = 0
motif = chrom.split('-')[1]
all_positions = []
all_segments = bam.fetch(reference=chrom)
@@ -121,7 +131,8 @@ def main():
mate_start = read.next_reference_start
mate_stop = mate_start + readlen
all_positions.append([mate_chr, mate_start, mate_stop])

if mate_chr == chrom: # check if both in pair map to the same STR decoy
STR_counts[chrom][1] += 1
# Strategy:
# Merge all overlapping intervals
# Keep the count of reads corresponding to each merged interval (i.e. 1 for each read contained in it)
@@ -182,6 +193,15 @@ def main():
sys.stderr.write('WARNING: it appears that {0} of the {1} reads overlapping the target STR regions were unpaired and so no useful data could be obtained from them.\n'.format(unpaired, total))

# Write results

if args.STR_counts:
STR_counts_df = pd.DataFrame.from_dict(STR_counts, orient='index',
columns=['decoycov', 'decoycov_pairs'])
with open(args.STR_counts, 'w') as STR_outstream:
STR_outstring = STR_counts_df.to_csv(sep='\t', header=True, index=True,
index_label='chrom')
STR_outstream.write(STR_outstring)

if outfile:
outstream = open(outfile, 'w')
else:
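The new --STR_counts output is a headered TSV that parse_STRcov consumes downstream; a sketch of the write path with toy counts:

```python
import pandas as pd

# Toy decoy counts: contig -> [mapped reads, same-decoy read pairs]
STR_counts = {'STR-AGC': [43, 17], 'STR-AAT': [7, 0]}
STR_counts_df = pd.DataFrame.from_dict(STR_counts, orient='index',
                                       columns=['decoycov', 'decoycov_pairs'])
print(STR_counts_df.to_csv(sep='\t', index_label='chrom'))
# chrom    decoycov    decoycov_pairs
# STR-AGC  43          17
# STR-AAT  7           0
```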