-
Notifications
You must be signed in to change notification settings - Fork 2
/
filter_queries.py
133 lines (125 loc) · 4.31 KB
/
filter_queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
This is the first script of the python pipeline.
It is meant to filter reads down to only those which have V and J alignments.
Run using python -m filter_queries <args>
Requirements:
Python >3.8.5, samtools, pysam, biopython, pandas
"""
import argparse
import os.path
from textwrap import dedent, indent
from typing import List, Dict, Iterator, Set
from pathlib import Path
from tcrgo.bam import BAMDict
import tcrgo.io as io
from tcrgo import Log
log = Log("root")
def main(args):
log.init(args.verbosity)
log.info("Parsing input data...")
bam = io.sort_and_index(args.bam, args.output_path)
log.info("Fetching unique queries which contain alignments to V and J and grouping them by Barcode-UMI...")
queries_VJ, queries_nonVJ = io.parse_queries(bam, args.barcode_tag, args.umi_tag)
log.info(f"Fetched {len(queries_VJ)} VJ reads and {len(queries_nonVJ)} non-VJ reads.")
bam.close()
# TODO: Require user do reference sequence similarity checks for subregion variants.
# We will assume for now dissimilar subregion variants.
# It would be useful to report pairs of regions that frequently appear together
# as alignments of a read.
# Maybe could use pysam.PileUpColumn/PileUp for this?
log.info("Writing all (partially) mapped queries not containing V and J alignments to file.")
io.output_queries(queries_nonVJ, os.path.join(args.output_path, "queries_nonVJ.tsv"))
log.info(f"Filtering out UMIs with VJ below {args.minimum_reads} reads and randomly "
f"subsampling those with reads exceeding {args.maximum_reads} reads.")
log.info(f"Will be writing filtered queries containing V and J to {args.workers} file(s).")
id_queries = io.filter_queries(queries_VJ, args.minimum_reads, args.maximum_reads, args.seed) # Split by delim '|' to convert to dictionary of BC|UMI : QNAME
io.output_grouped_VJ_queries(id_queries, args.workers, args.output_path)
log.success("Done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="python -m filter_queries", # Would be good to make a command-line alias and set that here
usage="",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=dedent(
f"""
+{'='*78}+
|{'{:^78}'.format('SEQ-WELL TCR RECOVERY - TCRGO?')}|
+{'='*78}+
{indent(__doc__, 3*' ')}
"""
),
epilog=dedent(
"""
Developed by James Gatter & Sarah Nyquist of Shalek Lab (2020).
Original Matlab pipeline written by Andy Tu and Tod Gierhann of Love Lab (2019).
"""
)
)
# REQUIRED ARGUMENTS
parser.add_argument(
"bam",
type=str,
metavar="<ALIGNED BAM FROM ALIGNMENT SCRIPT>",
help=
"Path to single-end tagged, trimmed, aligned, repaired BAM. IMPORTANT: "
"It is strongly recommended you use the alignment script to create this BAM."
)
# OPTIONAL ARGUMENTS
parser.add_argument(
'-v', "--verbosity",
type=str,
default="INFO",
choices=["INFO", "VERBOSE", "WARNING", "SUCCESS", "ERROR"],
help="The verbosity of the log. (default: %(default)s)."
)
parser.add_argument(
'-lt', "--minimum-reads",
type=int,
default=5,
help="UMIs with less than the minimum number of reads will be filtered out (default: %(default)d)."
)
parser.add_argument(
'-gt', "--maximum-reads",
type=int,
default=1000,
help=
"UMIs with more than the maximum number of reads will have their reads"
"randomly subsampled down to the maximum (default: %(default)d)."
)
parser.add_argument(
'-s', "--seed",
type=int,
default=2020,
help=
"The seed for pseudo-random subsampling of reads for UMIs that "
"exceed the maximum number of reads (default: %(default)d)."
)
parser.add_argument(
'-b', "--barcode-tag",
type=str,
default="CR",
help="The BAM tag which represents the cell barcode sequence. (default: %(default)s )."
)
parser.add_argument(
'-u', "--umi-tag",
type=str,
default="RX",
help="The BAM tag which represents the unique molecular barcode sequence. (default: %(default)s )."
)
parser.add_argument(
'-o', "--output-path",
type=str,
default="./out/queries/",
help="The path to which the files from this program will output. (default: %(default)s )."
)
parser.add_argument(
'-w', "--workers",
type=int,
metavar="#WORKERS",
default=1,
help=
"Divide the list of filtered reads by this number for processing "
"by multiple HPCC tasks or VMs (default: %(default)d)."
)
args = parser.parse_args()
main(args)