-
Notifications
You must be signed in to change notification settings - Fork 1
/
methyl_mallet
executable file
·158 lines (132 loc) · 3.94 KB
/
methyl_mallet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/bin/bash
progname="$( basename "$0" )"
progpath="$( dirname "$( readlink -f "$0" )" )"
set -e -o pipefail
# Quick check to see if binaries are made
if [[ ! -x "$progpath/bin/append_tag" || ! -x "$progpath/bin/spread" ]]; then
cat << EOF >&2
ERROR: $progname cannot run because compiled binaries have not been made.
ERROR: Try using the 'make' command to create those binaries.
ERROR: Consult the README.md for details.
EOF
exit 1
fi
# Standardize sort order
export LC_ALL=C
# Speed over compression ratio
export GZIP_OPT=-1
# Usage -----------------------------------------------------------------------
function usage {
cat << EOF >&2
usage:
$progname [-h] [-j JOBS] [-k] [-n NMERGE] [-S BUFFER_SIZE]
-d DIR -o OUT_FILE FILE [FILE ...]
Do a full outer join of tab-separated methylation files.
positional arguments:
FILE files to be joined
required arguments:
-d DIR working directory to store intermediary files
-o OUT_FILE file name to be output to
optional arguments:
-h show this help message and exit
-j JOBS number of parallel jobs using GNU parallel
-k keep intermediary files
-n NMERGE number of files to merge simultaneously
-S BUFFER_SIZE buffer size allocated to sorting operation
EOF
exit 1
}
# Options ---------------------------------------------------------------------
while getopts ":d:S:o:n:j:kh" opt; do
case $opt in
d)
work_dir=$OPTARG
;;
S)
buffer_size="--buffer-size=$OPTARG"
;;
o)
out_file=$OPTARG
;;
h)
usage
;;
k)
keep_files=true
;;
j)
n_jobs=$OPTARG
;;
n)
batch_size="--batch-size=$OPTARG"
;;
\?)
echo "Invalid option: -$OPTARG" >&2
usage
;;
:)
echo "Option -$OPTARG requires an argument." >&2
usage
;;
esac
done
shift $((OPTIND-1))
if [[ -z "$work_dir" || -z "$out_file" ]]; then
echo "ERROR: Missing option(s)"
usage
fi
mkdir -p "$work_dir"
# SORT ------------------------------------------------------------------------
echo "$progname: Sorting the inputs..." >&2
# Sort all of our input data files
CHECKPOINT=$SECONDS
if [[ -n "$n_jobs" && -x "$(command -v parallel)" ]]; then
echo "$progname: Sorting files in parallel" >&2
i=0
for f in "$@"; do
sorted_files[$i]="$work_dir/sorted_$( basename "$f" .gz )"
((++i))
done
parallel -j $n_jobs \
'zcat "{1}" | "{2}/bin/append_tag" "{1/.}" | tr "\t" "," | sort -t, -k 1n,1 -k 2n,2 -k 3,3 -k 4,4 {3} -T "{4}" -o "{4}/sorted_{1/.}"' \
::: "$@" ::: "$progpath" ::: $buffer_size ::: "$work_dir"
echo "$progname: Intial sorting done in $((SECONDS - CHECKPOINT)) seconds." >&2
CHECKPOINT=$SECONDS
else
echo "$progname: Sorting files one-by-one" >&2
i=0
for f in "$@"; do
file_name="$( basename "$f" )"
file_stem="$( basename "$f" .gz )"
sorted_files[$i]="$work_dir/sorted_$file_stem"
echo -n "$progname: $(printf '% 5i' $(expr $i + 1))/$#: $file_name" >&2
zcat "$f" | \
"$progpath/bin/append_tag" "$file_stem" | \
tr "\t" "," | \
sort -t, \
-k 1n,1 -k 2n,2 -k 3,3 -k 4,4 \
$buffer_size \
-T "$work_dir" \
-o "${sorted_files[$i]}"
echo " ($((SECONDS - CHECKPOINT)) seconds)" >&2
CHECKPOINT=$SECONDS
((++i))
done
fi
# LONG FILE -------------------------------------------------------------------
echo -n "$progname: Spreading the data..." >&2
sort -t, \
-k 1n,1 -k 2n,2 -k 3,3 -k 4,4 \
-m \
$buffer_size \
-T "$work_dir" \
$batch_size \
--compress-program=gzip \
"${sorted_files[@]}" | \
"$progpath/bin/spread" "$@" | \
gzip > "$out_file"
test -z "$keep_files" && rm "${sorted_files[@]}"
echo " ($((SECONDS - CHECKPOINT)) seconds)" >&2
CHECKPOINT=$SECONDS
# DONE ------------------------------------------------------------------------
echo "$progname: Success! All files joined. ($SECONDS seconds total)" >&2