-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_processed_fastq_by_raw.py
executable file
·47 lines (41 loc) · 1.45 KB
/
check_processed_fastq_by_raw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python2
"""
Check a fastq file that's been through some processing against the
input. Checks that:
- all reads in the processed file(s) (by read.id) are also in the
raw file
- processed read length <= raw read length
- each processed read file has no more than one copy of each read
(this won't work with interleaved files)

Usage: check_processed_fastq_by_raw <raw> <processed...>

Exits 0 and prints 'pass' when every check succeeds; exits 1 with a
diagnostic message on the first failure.
"""
import gzip
import sys
from Bio import SeqIO

# Pick the opener for the raw file: gzip-compressed inputs need gzip.open.
open_fun = open
if sys.argv[1].endswith('.gz'):
    open_fun = gzip.open

# Map read id -> raw read length for every read in the raw file, so each
# processed read can be checked for membership and length in O(1).
raw_reads = {}
with open_fun(sys.argv[1], 'rb') as ihandle:
    for rec in SeqIO.parse(ihandle, 'fastq'):
        raw_reads[rec.id] = len(rec)

for pfile in sys.argv[2:]:
    observed = set()  # Only check within a file - might be paired-end reads
    # Re-select the opener per processed file; each may or may not be gzipped.
    open_fun = open
    if pfile.endswith('.gz'):
        open_fun = gzip.open
    with open_fun(pfile, 'rb') as ihandle:
        for rec in SeqIO.parse(ihandle, 'fastq'):
            # Every processed read must exist in the raw input.
            if rec.id not in raw_reads:
                print('%s not found - fail on %s' % (rec.id, pfile))
                sys.exit(1)
            # Processing may trim reads but never lengthen them.
            if len(rec) > raw_reads[rec.id]:
                print('%s too big - fail on %s' % (rec.id, pfile))
                sys.exit(1)
            # No duplicate read ids within a single processed file.
            if rec.id in observed:
                print('%s duplicated - fail on %s' % (rec.id, pfile))
                sys.exit(1)
            observed.add(rec.id)
print('pass')
sys.exit(0)