
Commit

Updated/cleaned as part of revision.
Johannes Linder committed Sep 18, 2024
1 parent eeee3fe commit 6e07fef
Showing 7 changed files with 196 additions and 212 deletions.
80 changes: 31 additions & 49 deletions src/scripts/borzoi_test_apa_folds_polaydb.py
@@ -21,6 +21,8 @@

"""
borzoi_test_apa_folds_polaydb.py
Measure accuracy at polyadenylation-level for multiple model replicates.
"""

################################################################################
@@ -29,13 +31,6 @@
def main():
usage = "usage: %prog [options] <params_file> <data1_dir> ..."
parser = OptionParser(usage)
parser.add_option(
"-a",
"--alt",
dest="alternative",
default="two-sided",
help="Statistical test alternative [Default: %default]",
)
parser.add_option(
"-c",
dest="crosses",
@@ -50,13 +45,6 @@ def main():
type="int",
help="Dataset index [Default:%default]",
)
parser.add_option(
"--d_ref",
dest="dataset_ref_i",
default=None,
type="int",
help="Reference Dataset index [Default:%default]",
)
parser.add_option(
"-e",
dest="conda_env",
@@ -67,26 +55,19 @@
"-f",
dest="fold_subset",
default=None,
help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
)
parser.add_option("-g", dest="apa_file", default="polyadb_human_v3.csv.gz")
parser.add_option(
"--label_exp",
dest="label_exp",
default="Experiment",
help="Experiment label [Default: %default]",
type="int",
help="Run a subset of folds [Default:%default]",
)
parser.add_option(
"--label_ref",
dest="label_ref",
default="Reference",
help="Reference label [Default: %default]",
"--f_list",
dest="fold_subset_list",
default=None,
help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
)
parser.add_option(
"-m",
dest="metric",
default="pearsonr",
help="Train/test metric [Default: Pearsonr or AUPRC]",
"-g",
dest="apa_file",
default="polyadb_human_v3.csv.gz"
)
parser.add_option(
"--name",
@@ -101,14 +82,9 @@ def main():
help="Output experiment directory [Default: %default]",
)
parser.add_option(
"-p", dest="out_stem", default=None, help="Output plot stem [Default: %default]"
)
parser.add_option("-q", dest="queue", default="geforce")
parser.add_option(
"-r",
dest="ref_dir",
default=None,
help="Reference directory for statistical tests",
"-q",
dest="queue",
default="geforce"
)
parser.add_option(
"--rc",
@@ -124,20 +100,20 @@
type="str",
help="Ensemble prediction shifts [Default: %default]",
)
parser.add_option(
"--status",
dest="status",
default=False,
action="store_true",
help="Update metric status; do not run jobs [Default: %default]",
)
parser.add_option(
"-t",
dest="targets_file",
default=None,
type="str",
help="File specifying target indexes and labels in table format",
)
parser.add_option(
"-u",
dest="untransform_old",
default=False,
action="store_true",
help="Untransform old models [Default: %default]",
)
(options, args) = parser.parse_args()

if len(args) < 2:
@@ -161,12 +137,16 @@ def main():

# count folds
num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")])

fold_index = [fold_i for fold_i in range(num_folds)]

# subset folds
if options.fold_subset is not None:
fold_index = [int(fold_str) for fold_str in options.fold_subset.split(",")]
num_folds = min(options.fold_subset, num_folds)

fold_index = [fold_i for fold_i in range(num_folds)]

# subset folds (list)
if options.fold_subset_list is not None:
fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")]

if options.queue == "standard":
num_cpu = 4
@@ -192,7 +172,7 @@
model_file = "%s/train/model%d_best.h5" % (it_dir, options.dataset_i)

# check if done
acc_file = "%s/acc.txt" % out_dir
acc_file = "%s/apa_preds_polyadb.tsv.gz" % out_dir
if os.path.isfile(acc_file):
# print('%s already generated.' % acc_file)
pass
@@ -209,6 +189,8 @@
cmd += " --shifts %s" % options.shifts
if options.targets_file is not None:
cmd += " -t %s" % options.targets_file
if options.untransform_old:
cmd += " -u"
cmd += " %s" % params_file
cmd += " %s" % model_file
cmd += " %s/data%d" % (it_dir, head_i)
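Note on the fold-selection change above: -f (fold_subset) now takes an integer count of folds, while the new --f_list (fold_subset_list) takes explicit fold indices as a comma-separated string. A minimal sketch of how the two flags interact, assuming num_folds comes from the dataset statistics file (variable names mirror the diff; the values are made up):

    num_folds = 8
    fold_subset = 4             # from -f: keep only the first 4 folds
    fold_subset_list = "0,5"    # from --f_list: keep exactly these folds

    fold_index = list(range(num_folds))
    if fold_subset is not None:
        num_folds = min(fold_subset, num_folds)
        fold_index = list(range(num_folds))        # [0, 1, 2, 3]
    if fold_subset_list is not None:
        fold_index = [int(f) for f in fold_subset_list.split(",")]   # [0, 5]

Because the list-based subset is applied last, --f_list overrides -f when both are given.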
60 changes: 31 additions & 29 deletions src/scripts/borzoi_test_apa_polaydb.py
@@ -33,11 +33,6 @@
Measure accuracy at polyadenylation-level.
"""


def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)


################################################################################
# main
################################################################################
@@ -89,6 +84,13 @@ def main():
default=None,
help="TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]",
)
parser.add_option(
"-u",
dest="untransform_old",
default=False,
action="store_true",
help="Untransform old models [Default: %default]",
)
(options, args) = parser.parse_args()

if len(args) != 4:
@@ -132,9 +134,6 @@ def main():
num_targets = targets_df.shape[0]
num_targets_strand = targets_strand_df.shape[0]

# save sqrt'd tracks
sqrt_mask = np.array([ss.find("sqrt") != -1 for ss in targets_strand_df.sum_stat])

# read model parameters
with open(params_file) as params_open:
params = json.load(params_open)
@@ -188,9 +187,6 @@
# filter for 3' UTR polyA sites only
apa_df = apa_df.query("site_type == '3\\' most exon'").copy().reset_index(drop=True)

eprint("len(apa_df) = " + str(len(apa_df)))
print("len(apa_df) = " + str(len(apa_df)))

apa_df["start_hg38"] = apa_df["position_hg38"]
apa_df["end_hg38"] = apa_df["position_hg38"] + 1

@@ -207,16 +203,18 @@
apa_pr = pr.PyRanges(
apa_df[["Chromosome", "Start", "End", "pas_id", "cut_mode", "pas_strand"]]
)

# get strands
pas_strand_dict = {}
for _, row in apa_df.iterrows() :
pas_strand_dict[row['pas_id']] = row['pas_strand']

#######################################################
# intersect APA sites w/ preds, targets

# intersect seqs, APA sites
seqs_apa_pr = seqs_pr.join(apa_pr)

eprint("len(seqs_apa_pr.df) = " + str(len(seqs_apa_pr.df)))
print("len(seqs_apa_pr.df) = " + str(len(seqs_apa_pr.df)))

# hash preds/targets by pas_id
apa_preds_dict = {}
apa_targets_dict = {}
@@ -228,8 +226,7 @@
y = y.numpy()[..., targets_df.index]

t0 = time.time()
eprint("Sequence %d..." % si)
print("Sequence %d..." % si, end="")
print("Sequence %d..." % si, end="", flush=True)
for bsi in range(x.shape[0]):
seq = seqs_df.iloc[si + bsi]

@@ -276,14 +273,11 @@
apa_preds_dict.setdefault(pas_id, []).append(yhb)
apa_targets_dict.setdefault(pas_id, []).append(yb)
else:
eprint("(Warning: len(yb) <= 0)")
print("(Warning: len(yb) <= 0)", flush=True)

# advance sequence table index
si += x.shape[0]
eprint("DONE in %ds." % (time.time() - t0))
print("DONE in %ds." % (time.time() - t0))

eprint("len(apa_preds_dict) = " + str(len(apa_preds_dict)))
print("DONE in %ds." % (time.time() - t0), flush=True)

if si % 128 == 0:
gc.collect()
@@ -300,14 +294,22 @@
apa_targets_gi = np.concatenate(apa_targets_dict[pas_id], axis=0).astype(
"float32"
)

# undo scale
apa_preds_gi /= np.expand_dims(targets_strand_df.scale, axis=0)
apa_targets_gi /= np.expand_dims(targets_strand_df.scale, axis=0)

# undo sqrt
apa_preds_gi[:, sqrt_mask] = apa_preds_gi[:, sqrt_mask] ** (4 / 3)
apa_targets_gi[:, sqrt_mask] = apa_targets_gi[:, sqrt_mask] ** (4 / 3)

# slice strand
if pas_strand_dict[pas_id] == "+":
pas_strand_mask = (targets_df.strand != "-").to_numpy()
else:
pas_strand_mask = (targets_df.strand != "+").to_numpy()
apa_preds_gi = apa_preds_gi[:, pas_strand_mask]
apa_targets_gi = apa_targets_gi[:, pas_strand_mask]

# untransform
if options.untransform_old:
apa_preds_gi = dataset.untransform_preds1(apa_preds_gi, targets_strand_df, unscale=True, unclip=False)
apa_targets_gi = dataset.untransform_preds1(apa_targets_gi, targets_strand_df, unscale=True, unclip=False)
else:
apa_preds_gi = dataset.untransform_preds(apa_preds_gi, targets_strand_df, unscale=True, unclip=False)
apa_targets_gi = dataset.untransform_preds(apa_targets_gi, targets_strand_df, unscale=True, unclip=False)

# mean coverage
apa_preds_gi = apa_preds_gi.mean(axis=0)
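The per-PAS aggregation above now slices tracks by strand before untransforming, and the hand-rolled scale/sqrt inversion is replaced by a single call to dataset.untransform_preds (or dataset.untransform_preds1 for older models via -u). A small self-contained sketch of the strand-mask idea, assuming target strands are encoded as "+", "-", or "." (unstranded tracks match either PAS strand); it mirrors the diff but is not copied from it:

    import numpy as np

    def pas_strand_mask(target_strands, pas_strand):
        # Keep targets on the PAS strand; "." (unstranded) passes either way.
        target_strands = np.asarray(target_strands)
        if pas_strand == "+":
            return target_strands != "-"
        return target_strands != "+"

    mask = pas_strand_mask(["+", "-", ".", "+"], "+")   # [True, False, True, True]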
14 changes: 13 additions & 1 deletion src/scripts/borzoi_test_exons_folds.py
@@ -94,6 +94,12 @@ def main():
type="int",
help="Run a subset of folds [Default:%default]",
)
parser.add_option(
"--f_list",
dest="fold_subset_list",
default=None,
help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
)
parser.add_option(
"-g",
dest="exons_gff",
@@ -195,6 +201,12 @@
# subset folds
if options.fold_subset is not None:
num_folds = min(options.fold_subset, num_folds)

fold_index = [fold_i for fold_i in range(num_folds)]

# subset folds (list)
if options.fold_subset_list is not None:
fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")]

if options.queue == "standard":
num_cpu = 4
@@ -209,7 +221,7 @@
jobs = []

for ci in range(options.crosses):
for fi in range(num_folds):
for fi in fold_index:
it_dir = "%s/f%dc%d" % (options.exp_dir, fi, ci)

if options.dataset_i is None:
22 changes: 17 additions & 5 deletions src/scripts/borzoi_test_genes.py
@@ -117,6 +117,13 @@ def main():
action="store_true",
help="Untransform old models [Default: %default]",
)
parser.add_option(
"--store_span",
dest="store_span",
default=False,
action="store_true",
help="Store predicted/measured gene span coverage profiles [Default: %default]",
)
(options, args) = parser.parse_args()

if len(args) != 4:
@@ -323,16 +330,21 @@
preds_log = np.log2(gene_preds_gi[:, ti] + 1)
targets_log = np.log2(gene_targets_gi[:, ti] + 1)
gene_corr_gi[ti] = pearsonr(preds_log, targets_log)[0]
# gene_corr_gi[ti] = pearsonr(gene_preds_gi[:,ti], gene_targets_gi[:,ti])[0]
else:
gene_corr_gi[ti] = np.nan
gene_within.append(gene_corr_gi)
gene_wvar.append(gene_targets_gi.var(axis=0))

# TEMP: save gene preds/targets
# os.makedirs('%s/gene_within' % options.out_dir, exist_ok=True)
# np.save('%s/gene_within/%s_preds.npy' % (options.out_dir, gene_id), gene_preds_gi.astype('float16'))
# np.save('%s/gene_within/%s_targets.npy' % (options.out_dir, gene_id), gene_targets_gi.astype('float16'))
# optionally store raw coverage profiles for gene span
if options.store_span:
hash_code = str(gene_id.split(".")[0][-1]) # last digit of gene id

os.makedirs('%s/gene_within' % options.out_dir, exist_ok=True)
os.makedirs('%s/gene_within/%s' % (options.out_dir, hash_code), exist_ok=True)
os.makedirs('%s/gene_within/%s/preds' % (options.out_dir, hash_code), exist_ok=True)
os.makedirs('%s/gene_within/%s/targets' % (options.out_dir, hash_code), exist_ok=True)
np.save('%s/gene_within/%s/preds/%s_preds.npy' % (options.out_dir, hash_code, gene_id), gene_preds_gi.astype('float16'))
np.save('%s/gene_within/%s/targets/%s_targets.npy' % (options.out_dir, hash_code, gene_id), gene_targets_gi.astype('float16'))

# mean coverage
gene_preds_gi = gene_preds_gi.mean(axis=0) / float(pool_width)
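The new --store_span option in borzoi_test_genes.py writes per-gene coverage arrays as float16 .npy files, sharded into subdirectories keyed by the last character of the versionless gene id (a digit for Ensembl ids), presumably to keep any single directory from growing too large. A sketch of the path scheme with a made-up gene id (the layout follows the diff; the id and the omitted out_dir prefix are only illustrative):

    gene_id = "ENSG00000141510.16"               # hypothetical example id
    hash_code = gene_id.split(".")[0][-1]        # -> "0"
    preds_path = "gene_within/%s/preds/%s_preds.npy" % (hash_code, gene_id)
    targets_path = "gene_within/%s/targets/%s_targets.npy" % (hash_code, gene_id)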

0 comments on commit 6e07fef
