-
Notifications
You must be signed in to change notification settings - Fork 2
/
1-submit-encryption-jobs.sh
executable file
·150 lines (127 loc) · 6.3 KB
/
1-submit-encryption-jobs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
# INFO: if you want to restart the encryption for a file, delete all the corresponding *.md5 and *.gpg files
#
# This script will automatically find the most-recent "to-encrypt*.txt" file and process files therein.
# If you wish to use a different to-encrypt list, you can specify this as a command line argument:
# 1-submit-encryption-jobs.sh your-filelist.txt
# Pre-flight check: refuse to continue unless the EGA public key is present in
# the GPG keyring of this host. The key listing itself is discarded; only gpg's
# exit status matters.
if ! gpg --no-tty --batch --list-keys 'European Genome-Phenome Archive (EGA)' >/dev/null 2>&1; then
  # Quoted here-document: the text below is emitted literally on stderr
  # (including the backticks and the trailing blank line).
  cat >&2 <<'EOF'
ERROR: EGA public key not present in GPG keyring on this submission host
-> Worker nodes probably cannot encrypt with EGA as recipient.
Please import the EGA key!
Public key should be obtained from EGA: https://ega-archive.org/submission/public_keys .
(those who believe this author is trustworthy, can used the copy included with this script)
Import the obtained key with: `gpg --import submission_2020_public.gpg.asc`

EOF
  exit 17
fi
# CONFIG: scheduler that the encryption jobs are submitted to.
# The submission code further down knows 'LSF' and 'PBS'.
CLUSTER_SYSTEM="LSF"
#CLUSTER_SYSTEM="PBS"
# CONFIG (LSF only): job group under which the jobs are submitted.
JOBGROUP="/${USER}/egacrypt"
printf 'using cluster system: %s\n' "${CLUSTER_SYSTEM}"
# Resolve the real location of this script (following symlinks) and load the
# helper library that lives in the same directory.
_egacrypt_self="$(readlink -f "$0")"
OUR_DIR="$(dirname "${_egacrypt_self}")"
source "${OUR_DIR}/util.sh"

# Pick the file-list: the default (latest) one, unless the user named another
# list as first command line argument. The util.sh helpers select and verify it.
OVERRIDE_FILE="$1"
TO_ENCRYPT_LIST="$(get_default_or_override_to_encrypt_list "${OVERRIDE_FILE}")"
verify_to_encrypt_list "${TO_ENCRYPT_LIST}"
printf 'using file-list: %s\n' "${TO_ENCRYPT_LIST}"

# Every working path hangs off the directory the script is run from.
_egacrypt_cwd="$(pwd)"
WORKDIR="${_egacrypt_cwd}/files/"
SUBMITLOG="${_egacrypt_cwd}/_submitted_jobs_$(date +%Y-%m-%d_%H:%M:%S)"
JOBLOGDIR="${_egacrypt_cwd}/cluster-logs"

# Create the directory for the scheduler's job logs on first use.
[ -d "${JOBLOGDIR}" ] || mkdir "${JOBLOGDIR}"
# Work out which files from the to-encrypt list still need an encryption job.
#
# A file counts as "encrypted and/or in progress" as soon as WORKDIR contains a
# matching '<name>.gpg' or '<name>.gpg.partial' file. Names are compared by
# basename only, so the list may hold absolute paths or paths relative to WORKDIR.
#
# All lists are read line-by-line with `mapfile`. Filling an array through an
# unquoted $( ... ) word-splits on blanks and glob-expands the result, and
# backslash-escaping the blanks does not prevent that, so a name such as
# "my file.bam" used to end up as two bogus entries.

# Print the sorted basenames taken from field 2 (as selected by `cut -f2`) of
# the list file $1, one per line. Empty names are dropped.
_egacrypt_requested_names() {
  cut -f2 -- "$1" \
    | sed -E -e 's#^.+/##' -e '/^$/d' \
    | sort
}

# Print, sorted and one per line, the original basename belonging to every
# '*.gpg' / '*.gpg.partial' file found below directory $1.
_egacrypt_workdir_names() {
  find "$1" -type f \( -name '*.gpg' -or -name '*.gpg.partial' \) \
    | sed -E -e 's#^.+/##' -e 's/\.gpg(\.partial)?$//' -e '/^$/d' \
    | sort
}

# Print every argument on its own line. Prints nothing at all when called
# without arguments (a bare `printf '%s\n'` would emit one empty line, which
# `comm` would then treat as a filename).
_egacrypt_print_lines() {
  if (( $# > 0 )); then
    printf -- '%s\n' "$@"
  fi
}

# Fill the global arrays toEncryptFiles, workdirFiles, unencryptedFiles and
# alreadyEncryptedFiles.
#   $1 - to-encrypt list file
#   $2 - work directory holding the encryption output
_egacrypt_compute_file_sets() {
  mapfile -t toEncryptFiles < <( _egacrypt_requested_names "$1" )
  mapfile -t workdirFiles < <( _egacrypt_workdir_names "$2" )

  # comm -23: lines only in the first input  -> nothing in WORKDIR yet
  mapfile -t unencryptedFiles < <( comm -23 \
    <( _egacrypt_print_lines "${toEncryptFiles[@]}" ) \
    <( _egacrypt_print_lines "${workdirFiles[@]}" ) )

  # comm -12: lines present in both inputs   -> finished or partial output exists
  mapfile -t alreadyEncryptedFiles < <( comm -12 \
    <( _egacrypt_print_lines "${toEncryptFiles[@]}" ) \
    <( _egacrypt_print_lines "${workdirFiles[@]}" ) )
}

_egacrypt_compute_file_sets "$TO_ENCRYPT_LIST" "$WORKDIR"
echo "found ${#alreadyEncryptedFiles[*]} encrypted and/or in-progress files. Submitting ${#unencryptedFiles[*]} new encryption jobs:"

# Make sure the LSF job group $1 exists, creating it with a default job limit
# when it is missing. Does nothing unless CLUSTER_SYSTEM is 'LSF', because
# bjgroup/bgadd are LSF commands and job groups do not apply to PBS.
#
# After creation, we don't touch the limit anymore, so people can finetune it for their cluster environment.
# We impose this concurrency-limit to avoid swamping cluster I/O.
# Too many parallel jobs bottleneck the storage, and make our walltime estimate meaningless.
# see: https://www.ibm.com/support/knowledgecenter/en/SSETD4_9.1.3/lsf_admin/job_group_limits.html
_egacrypt_ensure_jobgroup() {
  local jobgroup="$1"
  local -r default_limit=16

  if [[ "$CLUSTER_SYSTEM" != "LSF" ]]; then
    return 0
  fi

  # bjgroup always exits with '0/success', even if the group is not found, so
  # its (stdout+stderr) output has to be inspected instead of its exit status.
  if 2>&1 bjgroup -s "$jobgroup" | grep -q "No job group found" -; then
    echo "Jobgroup \"$jobgroup\" not found, creating with default limit $default_limit"
    bgadd -L "$default_limit" "$jobgroup"
  fi
}

# If we have anything to encrypt at all..
if [ "${#unencryptedFiles[@]}" -ge 1 ]; then
  # .. check if our jobgroup is available. If not, create it with default limit.
  _egacrypt_ensure_jobgroup "$JOBGROUP"
  # .. and print the header for our job-submissions
  echo -e "FILE \tWTIME\tSUBMISSION_FEEDBACK" | tee -a "$SUBMITLOG"
fi
# Submit one encryption job per file that has no (partial) output yet.

# Be VERY pessimistic about encryption speed: 200 MByte/minute ~ 3 MByte/second.
#   under good conditions, we can easily do five times that, but bad "I/O weather"
#   can easily kill throughput.
# By underestimating our speed, we'll probably request too much walltime, but that's
#   better than being walltime-killed 80% of the way (with no way to resume later)
BYTES_PER_MINUTE=200000000

# Estimate the walltime needed for a file of $1 bytes; the result is stored in
# the globals HOURS and MINUTES.
# One minute is added as margin and to avoid requesting "0" for tiny files. It
# is added BEFORE splitting into hours/minutes so MINUTES stays within 0..59
# (adding it afterwards could produce e.g. "0h60m").
_egacrypt_estimate_walltime() {
  local bytes="${1:-0}"
  local total_minutes=$(( bytes / BYTES_PER_MINUTE + 1 ))
  HOURS=$(( total_minutes / 60 ))
  MINUTES=$(( total_minutes % 60 ))
}

# Print the walltime request for scheduler $1 ('PBS' or 'LSF'), given hours $2
# and minutes $3: 'HH:MM:00' for PBS, 'H:MM' for LSF.
# Returns 1 (printing nothing) for any other scheduler.
_egacrypt_format_walltime() {
  case "$1" in
    PBS) printf '%02d:%02d:00' "$2" "$3" ;;
    LSF) printf '%d:%02d' "$2" "$3" ;;
    *) return 1 ;;
  esac
}

# Quoted "${...[@]}" so every array element is used as-is (no re-splitting on blanks).
for SHORTNAME in "${unencryptedFiles[@]}"; do
  FULL_FILE="$WORKDIR/$SHORTNAME"

  if [ ! -e "$FULL_FILE" ]; then
    echo "WARNING: File not found: $FULL_FILE" | tee -a "$SUBMITLOG"
    continue
  fi

  # Request a sensible amount of walltime, and let the queue runlimits sort out which queue we get
  FILESIZE=$(stat -c '%s' "$(readlink -f "$FULL_FILE")") # in bytes
  _egacrypt_estimate_walltime "$FILESIZE"

  # prepend filename before job-id output (intentionally no newline!)
  printf "%-29s\t%dh%02dm\t" "$SHORTNAME" "$HOURS" "$MINUTES" | tee -a "$SUBMITLOG"

  # actual job submission, prints job-id
  case "$CLUSTER_SYSTEM" in
    PBS)
      2>&1 qsub \
        -v "FULL_FILE=$FULL_FILE,WORKDIR=$WORKDIR" \
        -N "egacrypt-$SHORTNAME" \
        -e "$JOBLOGDIR" \
        -o "$JOBLOGDIR" \
        -l "walltime=$( _egacrypt_format_walltime PBS "$HOURS" "$MINUTES" )" \
        < "${OUR_DIR}/JOB-ega-encryption.sh" | tee -a "$SUBMITLOG"
      ;;
    LSF)
      2>&1 bsub \
        -env "FULL_FILE=$FULL_FILE, WORKDIR=$WORKDIR" \
        -g "$JOBGROUP" \
        -J "egacrypt-$SHORTNAME" \
        -Jd "encrypting $SHORTNAME ($FULL_FILE) for the EGA archive" \
        -e "$JOBLOGDIR/%J-$SHORTNAME.err" \
        -o "$JOBLOGDIR/%J-$SHORTNAME.out" \
        -W "$( _egacrypt_format_walltime LSF "$HOURS" "$MINUTES" )" \
        < "${OUR_DIR}/JOB-ega-encryption.sh" | tee -a "$SUBMITLOG"
      ;;
    *)
      echo "ERROR: specified unknown cluster system '$CLUSTER_SYSTEM'; no jobs submitted" | tee -a "$SUBMITLOG"
      exit 42
      ;;
  esac
done