[WIP] Str counts #37

Open · wants to merge 6 commits into master
12 changes: 6 additions & 6 deletions .testing/STRs.benchmark.tsv
@@ -1,6 +1,6 @@
chrom start end sample repeatunit reflen locuscoverage outlier p_adj bpInsertion repeatUnits
chr13 70713515 70713561 11 AGC 15.3 35 1.88685648944145 0.0295898153640992 316.950118548597 120.950039516199
chr13 70713515 70713561 69 AGC 15.3 8 0.4297790752454 0.333678177749521 76.1210174457791 40.6736724819264
chr13 70713515 70713561 1 AGC 15.3 3 -0.426744171073212 0.665217162914456 32.9115187238725 26.2705062412908
chr13 70713515 70713561 54 AGC 15.3 2 -0.730726313557163 0.7675268301731 24.4403744973217 23.4467914991072
chr13 70713515 70713561 49 AGC 15.3 1 -1.15916508005647 0.876805548823274 16.0677184130193 20.6559061376731
chrom start end sample repeatunit reflen locuscoverage total_assigned outlier p_adj bpInsertion repeatUnits repeatUnits_max
chr13 70713515 70713561 11_L001_R1 AGC 15.3 35 52.0 1.9 0.03 317.0 121.0 172.9
chr13 70713515 70713561 69_L001_R1 AGC 15.3 8 8.0 0.4 0.33 76.1 40.7 40.7
chr13 70713515 70713561 1_L001_R1 AGC 15.3 3 3.0 -0.4 0.67 32.9 26.3 26.3
chr13 70713515 70713561 54_L001_R1 AGC 15.3 2 2.0 -0.7 0.77 24.4 23.4 23.4
chr13 70713515 70713561 49_L001_R1 AGC 15.3 1 1.0 -1.2 0.88 16.1 20.7 20.7
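A worked check of the new total_assigned column against the old values (a sketch: decoycov_pairs is inferred, not a column in this file, and assumes each sample has only the one AGC locus shown here, so the per-repeat-unit sum equals locuscoverage):

```python
# Sample 11, first new row (benchmark values; pair count inferred for illustration)
locuscoverage = 35      # reads assigned to the chr13 AGC locus
repeatunit_sum = 35     # single AGC locus for this sample, so sum == locuscoverage
total_assigned = 52.0   # reported in the new column
decoycov_pairs = (total_assigned - locuscoverage) * repeatunit_sum / locuscoverage
print(decoycov_pairs)   # 17.0 same-decoy read pairs
```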
1 change: 1 addition & 0 deletions .travis.yml
@@ -11,6 +11,7 @@ script:
# Run the test data
- ../tools/bin/bpipe run ../pipelines/STRetch_exome_pipeline.groovy ../test-data/*.fastq.gz
- if diff STRs.tsv ../.testing/STRs.benchmark.tsv; then echo exit 0; else echo exit 1; fi
#- diff STRs.tsv ../.testing/STRs.benchmark.tsv
after_script:
- head *.locus_counts *.STR_counts *.median_cov
- head *.tsv
1 change: 0 additions & 1 deletion pipelines/STRetch_exome_pipeline.groovy
@@ -15,7 +15,6 @@ run {
set_sample_info +
align_bwa + index_bam +
median_cov_region +
STR_coverage +
STR_locus_counts
] +
estimate_size
5 changes: 2 additions & 3 deletions pipelines/STRetch_wgs_bam_pipeline.groovy
@@ -32,10 +32,9 @@ run {
mosdepth_dist + mosdepth_median,
shards * [
init_shard + align_bwa_bam + index_bam
] + merge_bams
] +
STR_coverage +
] + merge_bams +
STR_locus_counts
]
] +
estimate_size
}
1 change: 0 additions & 1 deletion pipelines/STRetch_wgs_pipeline.groovy
@@ -12,7 +12,6 @@ run {
set_sample_info +
align_bwa + index_bam +
median_cov +
STR_coverage +
STR_locus_counts
] +
estimate_size
24 changes: 9 additions & 15 deletions pipelines/pipeline_stages.groovy
@@ -55,9 +55,13 @@ set_sample_info = {
// sample_flowcell_library_lane_read.fastq.gz
// however sample can also include underscores
def info = get_info(input)
if (info.length >= 5) {
if (info.length == 2) {

//branch.sample = info[0..-5].join("_")
branch.sample = info[0..-2].join("_")
}
if (info.length >= 5) {

branch.sample = info[0..-5].join("_")
branch.flowcell = info[-4]
branch.library = info[-3]
branch.lane = info[-2]
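For readers unfamiliar with the naming convention this stage parses (sample_flowcell_library_lane_read.fastq.gz, where the sample itself may contain underscores), a minimal Python sketch of the equivalent logic; get_info and the filename are hypothetical stand-ins for the Groovy helpers:

```python
import os

def get_info(path):
    # Hypothetical analogue of the pipeline's get_info: split the basename
    # into its underscore-delimited fields
    base = os.path.basename(path).replace('.fastq.gz', '')
    return base.split('_')

info = get_info('NA12878_rep2_H7AGFADXX_LIB01_L001_R1.fastq.gz')  # made-up name
if len(info) == 2:
    sample = '_'.join(info[:-1])    # drop the read field only
elif len(info) >= 5:
    sample = '_'.join(info[:-4])    # drop flowcell, library, lane, read
    flowcell, library, lane = info[-4], info[-3], info[-2]
```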
@@ -165,27 +169,17 @@ index_bam = {
forward input
}

STR_coverage = {
transform("bam") to ("STR_counts") {
exec """
$bedtools coverage -counts
-sorted
-g ${REF}.genome
-a $DECOY_BED
-b $input.bam > $output.STR_counts
"""
}
}

STR_locus_counts = {
transform("bam") to ("locus_counts") {

transform("bam") to ("locus_counts", "STR_counts") {
exec """
STRPATH=$PATH;
PATH=$STRETCH/tools/bin:$PATH;
$python $STRETCH/scripts/identify_locus.py
--bam $input.bam
--bed $STR_BED
--output $output.locus_counts
--STR_counts $output.STR_counts
;PATH=$STRPATH
"""
}
93 changes: 48 additions & 45 deletions scripts/estimateSTR.py
@@ -59,13 +59,12 @@ def parse_STRcov(filename):
"""Parse all STR coverage"""
sample_id = get_sample(filename)
try:
cov_data = pd.read_table(filename, delim_whitespace = True,
names = ['chrom', 'start', 'end', 'decoycov'])
cov_data = pd.read_table(filename, delim_whitespace = True)
except pd.io.common.EmptyDataError:
sys.exit('ERROR: file {0} was empty.\n'.format(filename))
cov_data['sample'] = sample_id
cov_data['repeatunit'] = [x.split('-')[1] for x in cov_data['chrom']]
cov_data = cov_data[['sample', 'repeatunit', 'decoycov']]
cov_data = cov_data[['sample', 'repeatunit', 'decoycov', 'decoycov_pairs']]
return(cov_data)

def parse_locuscov(filename):
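parse_STRcov now reads the column names from the file header (written by identify_locus.py, see below) instead of hard-coding them, and carries the new decoycov_pairs column through. A self-contained sketch with toy values:

```python
import io
import pandas as pd

# Toy STR_counts content in the new header-carrying format (values made up)
toy = io.StringIO("chrom\tdecoycov\tdecoycov_pairs\n"
                  "STR-AGC\t43\t17\n"
                  "STR-AAT\t7\t0\n")
cov_data = pd.read_table(toy, delim_whitespace=True)
cov_data['repeatunit'] = [x.split('-')[1] for x in cov_data['chrom']]
print(cov_data[['repeatunit', 'decoycov', 'decoycov_pairs']])
```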
@@ -224,36 +223,29 @@ def main():
factor = 100

STRcov_data = pd.merge(STRcov_data, genomecov_data)
#STRcov_data['decoycov_norm'] = factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov']
#STRcov_data['decoycov_log'] = np.log2(STRcov_data['decoycov_norm'])
#XXX combines the previous two lines into one. Keeping commented out in case coverage_norm is required later
STRcov_data['decoycov_log'] = np.log2(factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov'])

# #XXX ***Not fully implemented***
# # Look for repeat units where the number of reads mapping to the decoy can't be
# # explained by those mapping to all loci with that repeat unit
#
# # Sum the counts over all loci for each repeat unit in each sample
# locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].aggregate(np.sum)
# locus_totals = pd.DataFrame(locus_totals).reset_index() # convert to DataFrame and make indices into columns
# # Calculate the difference between reads assigned to a decoy and the sum of
# # all reads assigned to loci with that repeat unit
# all_differences = pd.merge(STRcov_data, locus_totals, how='left')
# all_differences['difference'] = all_differences['decoycov'] - all_differences['locuscoverage']
# # Normalise differences by median coverage and take the log2
# all_differences['difference_log'] = np.log2(factor * (all_differences['difference'] + 1) / all_differences['genomecov'])
#
# locus_totals = pd.merge(locus_totals, STRcov_data)
#
# # Assign decoy counts to each locus, based on what proportion of the counts for that repeat unit they already have

locus_totals = pd.merge(locuscov_data, STRcov_data, how = 'left')

locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove this line if implementing the above

# Use pairs of reads where both map to the same STR decoy chromosome to increase
# size estimates and calculate outliers
# Sum the counts over all loci for each repeat unit in each sample and count loci
# with same repeat unit
locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
'count']).reset_index()
locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
# Assign decoy counts to each locus, based on what proportion of the counts for that
# repeat unit they already have
locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
# NaNs introduced where count = 0, fill these with locuscoverage
locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)

#XXX Uncomment if not implementing the above
# locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
# locus_totals['total_assigned'] = locus_totals['locuscoverage']

locus_totals['locuscoverage_log'] = np.log2(factor * (locus_totals['locuscoverage'] + 1) / locus_totals['genomecov'])
locus_totals['total_assigned_log'] = np.log2(factor * (locus_totals['total_assigned'] + 1) / locus_totals['genomecov'])

# For each locus, calculate if that sample is an outlier relative to others
total_assigned_wide = locus_totals.pivot(index='locus', columns='sample', values='total_assigned_log')
locuscoverage_wide = locus_totals.pivot(index='locus', columns='sample', values='locuscoverage_log')

# Calculate values for if there were zero reads at a locus in all samples
null_locus_counts = np.log2(factor * (0 + 1) / genomecov_data['genomecov'])
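A minimal pandas sketch of the proportional assignment above, with toy numbers: same-decoy read pairs (decoycov_pairs) are shared out among a sample's loci for that repeat unit in proportion to the locus counts they already have.

```python
import pandas as pd

locuscov_data = pd.DataFrame({
    'sample': ['s1', 's1', 's1'],
    'repeatunit': ['AGC', 'AGC', 'AAT'],
    'locus': ['chr13-1-2', 'chr13-5-6', 'chr2-1-2'],
    'locuscoverage': [30, 10, 5],
})
STRcov_data = pd.DataFrame({
    'sample': ['s1', 's1'],
    'repeatunit': ['AGC', 'AAT'],
    'decoycov_pairs': [8, 0],
})

totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(
    ['sum', 'count']).reset_index()
locus_totals = locuscov_data.merge(totals, how='left').merge(STRcov_data, how='left')
locus_totals['total_assigned'] = (locus_totals['locuscoverage']
    + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage'] / locus_totals['sum'])
# chr13-1-2 receives 30 + 8 * 30/40 = 36; chr13-5-6 receives 10 + 8 * 10/40 = 12
locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
```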
@@ -268,7 +260,7 @@ def main():

# Use Huber's M-estimator to calculate median and SD across all samples
# for each locus
sample_estimates = total_assigned_wide.apply(hubers_est, axis=1)
sample_estimates = locuscoverage_wide.apply(hubers_est, axis=1)
# Where sd is NA, replace with the minimum non-zero sd from all loci
min_sd = np.min(sample_estimates['sd'][sample_estimates['sd'] > 0])
sample_estimates['sd'].fillna(min_sd, inplace=True)
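hubers_est is not shown in this diff; a plausible shape for it, using the Huber M-estimator from statsmodels (an assumption about the implementation, not the script's exact code):

```python
import numpy as np
import pandas as pd
from statsmodels.robust.scale import huber

def hubers_est(row):
    # Robust location (mu) and scale (sd) across samples at one locus;
    # fall back to a NaN scale when the estimator fails to converge
    vals = np.asarray(row.dropna(), dtype=float)
    try:
        mu, sd = huber(vals)
    except ValueError:
        mu, sd = np.median(vals), np.nan
    return pd.Series({'mu': mu, 'sd': sd})

wide = pd.DataFrame([[1.2, 1.0, 1.1, 3.5],
                     [0.9, 1.1, 1.0, 1.2]], index=['locusA', 'locusB'])
estimates = wide.apply(hubers_est, axis=1)
# The script then replaces NaN sd values with the minimum non-zero sd over loci
```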
@@ -284,7 +276,7 @@ def main():

sample_estimates.loc['null_locus_counts'] = null_locus_counts_est

n = len(total_assigned_wide.columns)
n = len(locuscoverage_wide.columns)
sample_estimates['n'] = n

sample_estimates.to_csv(emit_file, sep= '\t')
@@ -296,30 +288,30 @@ def main():
control_estimates = parse_controls(control_file)
# Get a list of all loci in the control file but not the sample data
control_loci_df = control_estimates.iloc[control_estimates.index != 'null_locus_counts']
control_loci = [x for x in control_loci_df.index if x not in total_assigned_wide.index]
control_loci = [x for x in control_loci_df.index if x not in locuscoverage_wide.index]

# Extract and order just those control estimates appearing in the current data
mu_sd_estimates = control_estimates.reindex(total_assigned_wide.index)
mu_sd_estimates = control_estimates.reindex(locuscoverage_wide.index)
# Fill NaNs with null_locus_counts values
mu_sd_estimates.fillna(control_estimates.loc['null_locus_counts'],
inplace=True)
else:
# Extract and order estimates to match the current data
mu_sd_estimates = sample_estimates.reindex(total_assigned_wide.index)
mu_sd_estimates = sample_estimates.reindex(locuscoverage_wide.index)

# calculate z scores
z = z_score(total_assigned_wide, mu_sd_estimates)
z = z_score(locuscoverage_wide, mu_sd_estimates)

# If a control file is given, effectively add zeros counts at all loci in
# controls but not in the samples.
# These extra rows will disappear due to a later merge
if control_file != '':
# Create a total_assigned_wide as if all loci have zero counts
null_total_assigned_wide = pd.DataFrame(columns = sample_names, index = control_loci)
null_total_assigned_wide.fillna(null_locus_counts, inplace = True)
# Create a locuscoverage_wide as if all loci have zero counts
null_locuscoverage_wide = pd.DataFrame(columns = sample_names, index = control_loci)
null_locuscoverage_wide.fillna(null_locus_counts, inplace = True)
# Calculate z scores
null_z = z_score(null_total_assigned_wide,
control_estimates.reindex(null_total_assigned_wide.index))
null_z = z_score(null_locuscoverage_wide,
control_estimates.reindex(null_locuscoverage_wide.index))
loci_with_counts = z.index
z = z.append(null_z)
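z_score is also defined elsewhere in the script; its assumed form, standardising each locus row of the wide matrix by that locus's robust estimates:

```python
# Reusing `wide` and `estimates` from the hubers_est sketch above
def z_score(wide, estimates):
    # (wide - mu) / sd, aligned on the locus index (assumed helper signature)
    return wide.sub(estimates['mu'], axis=0).div(estimates['sd'], axis=0)

z = z_score(wide, estimates)
```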

@@ -375,14 +367,19 @@ def main():
X_train = np.log2(STRcov_model['coverage_norm']).values.reshape(-1, 1)
Y_train = np.log2(STRcov_model['allele2'])
regr.fit(X_train, Y_train)
# Make a prediction
Y_pred = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
# Make a prediction based on locuscoverage
Y_pred = regr.predict(locus_totals['locuscoverage_log'].values.reshape(-1, 1))
predict = np.power(2, Y_pred)
locus_totals['bpInsertion'] = predict
# Make a prediction based on total_assigned
Y_pred_max = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
predict_max = np.power(2, Y_pred_max)
locus_totals['bpInsertion_max'] = predict_max

# Get the estimated size in terms of repeat units (total, not relative to ref)
repeatunit_lens = [len(x) for x in locus_totals['repeatunit']]
locus_totals['repeatUnits'] = (locus_totals['bpInsertion']/repeatunit_lens) + locus_totals['reflen']
locus_totals['repeatUnits_max'] = (locus_totals['bpInsertion_max']/repeatunit_lens) + locus_totals['reflen']

# Split locus into 3 columns: chrom start end
locuscols = pd.DataFrame([x.split('-') for x in locus_totals['locus']],
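The size model is a log-log linear regression from normalised decoy coverage to known expansion size; this PR applies it twice, once to locuscoverage_log (reported as bpInsertion) and once to total_assigned_log (reported as bpInsertion_max). A toy sketch of that double prediction (training values are made up; regr is a scikit-learn LinearRegression as in the script):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Made-up training data: log2 normalised coverage vs log2 allele length (bp)
X_train = np.log2(np.array([4.0, 8.0, 16.0, 32.0])).reshape(-1, 1)
Y_train = np.log2(np.array([100.0, 190.0, 410.0, 800.0]))
regr = LinearRegression().fit(X_train, Y_train)

# One locus, toy values: predict from locus reads alone, then including pair reads
locuscoverage_log, total_assigned_log = 4.2, 4.8
bpInsertion = 2 ** regr.predict([[locuscoverage_log]])[0]
bpInsertion_max = 2 ** regr.predict([[total_assigned_log]])[0]
print(bpInsertion, bpInsertion_max)  # _max is larger whenever pair reads were added
```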
@@ -392,14 +389,20 @@ def main():
# Specify output data columns
write_data = locus_totals[['chrom', 'start', 'end',
'sample', 'repeatunit', 'reflen',
'locuscoverage',
'locuscoverage', 'total_assigned',
'locuscoverage_log', 'total_assigned_log', 'genomecov',
'outlier', 'p_adj',
'bpInsertion', 'repeatUnits'
'bpInsertion', 'repeatUnits', 'repeatUnits_max'
]]

# Sort by outlier score then estimated size (bpInsertion), both descending
write_data = write_data.sort_values(['outlier', 'bpInsertion'], ascending=[False, False])
#XXX check for duplicate rows?
# Convert outlier and p_adj to numeric type and do some rounding/formatting
write_data['outlier'] = pd.to_numeric(write_data['outlier'])
write_data['p_adj'] = [ format(x, '.2g') for x in pd.to_numeric(write_data['p_adj']) ]
write_data = write_data.round({'total_assigned': 1, 'outlier': 1,
'bpInsertion': 1, 'repeatUnits': 1, 'repeatUnits_max': 1})

# Write individual files for each sample, remove rows where locuscoverage == 0
samples = set(write_data['sample'])
26 changes: 23 additions & 3 deletions scripts/identify_locus.py
@@ -33,7 +33,10 @@ def parse_args():
help='bed file containing genomic locations of STR loci. Genomic locations should be relative to the fasta reference used to create the bam')
parser.add_argument(
'--output', type=str, required=False,
help='Output file name. Defaults to stdout.')
help='Output file name for locus counts. Defaults to stdout.')
parser.add_argument(
'--STR_counts', type=str, required=False,
help='Output file name for total counts of reads aligned to each STR decoy chromosome. If not given, these will not be reported.')
parser.add_argument(
'--dist', type=int, required=False, default=500,
help='Counts are only assigned to an STR locus that is at most this many bp away. The best choice is probably the insert size. (default: %(default)s)')
@@ -95,6 +98,13 @@ def main():
# Read bam
bam = pysam.Samfile(bamfile, 'rb')

# Get count of reads aligned to STR decoy chromosomes
STR_counts = {}
index_stats = bam.get_index_statistics()
for x in index_stats:
if x.contig.startswith("STR-"):
STR_counts[x.contig] = [x.mapped, None] # cols: mapped, both on same decoy

# Get relevant chromosomes
required_chroms = []
unpaired = 0
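get_index_statistics() reads mapped-read counts per contig straight from the BAM index, which is why the separate bedtools-based STR_coverage stage can be dropped from the pipelines. A sketch mirroring the new code, with a hypothetical BAM path:

```python
import pysam

bam = pysam.Samfile('sample.bam', 'rb')  # hypothetical indexed BAM
STR_counts = {}
for stat in bam.get_index_statistics():
    if stat.contig.startswith('STR-'):
        # [reads mapped to the decoy, pairs with both mates on it (filled later)]
        STR_counts[stat.contig] = [stat.mapped, None]
```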
@@ -105,8 +115,8 @@ def main():
# Check if any STR- chromosomes
if len(required_chroms) == 0:
sys.exit('ERROR: There were no reads mapping to chromosomes with names starting with "STR-" in {0}. Are you sure this data is mapped to a reference genome with STR decoy chromosomes?'.format(bamfile))

for chrom in required_chroms:
STR_counts[chrom][1] = 0
motif = chrom.split('-')[1]
all_positions = []
all_segments = bam.fetch(reference=chrom)
@@ -121,7 +131,8 @@ def main():
mate_start = read.next_reference_start
mate_stop = mate_start + readlen
all_positions.append([mate_chr, mate_start, mate_stop])

if mate_chr == chrom: # check if both in pair map to the same STR decoy
STR_counts[chrom][1] += 1
# Strategy:
# Merge all overlapping intervals
# Keep the count of reads corresponding to each merged interval (i.e. 1 for each read contained in it)
@@ -182,6 +193,15 @@ def main():
sys.stderr.write('WARNING: it appears that {0} of the {1} reads overlapping the target STR regions were unpaired and so no useful data could be obtained from them.\n'.format(unpaired, total))

# Write results

if args.STR_counts:
STR_counts_df = pd.DataFrame.from_dict(STR_counts, orient='index',
columns=['decoycov', 'decoycov_pairs'])
with open(args.STR_counts, 'w') as STR_outstream:
STR_outstring = STR_counts_df.to_csv(sep='\t', header=True, index=True,
index_label='chrom')
STR_outstream.write(STR_outstring)

if outfile:
outstream = open(outfile, 'w')
else:
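The new --STR_counts output is a headered TSV that parse_STRcov consumes downstream; a sketch of the write path with toy counts:

```python
import pandas as pd

# Toy decoy counts: contig -> [mapped reads, same-decoy read pairs]
STR_counts = {'STR-AGC': [43, 17], 'STR-AAT': [7, 0]}
STR_counts_df = pd.DataFrame.from_dict(STR_counts, orient='index',
                                       columns=['decoycov', 'decoycov_pairs'])
print(STR_counts_df.to_csv(sep='\t', index_label='chrom'))
# chrom    decoycov    decoycov_pairs
# STR-AGC  43          17
# STR-AAT  7           0
```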