#!/usr/local/bin/bash
# Using GNU parallel painlessly
# maria.nattestad@gmail.com
# Jul 18, 2022
# https://omgenomics.com/parallel/
# https://www.gnu.org/software/parallel/
# https://yewtu.be/watch?v=qypUdm-IE9c
## 1) Basic examples
## --
parallel echo ::: A B C
# Get inputs from a command:
parallel echo ::: $(seq 10)
# Makes a sample file with numbers 1 to 100.
seq 100 > file.txt
# Get inputs from a file:
parallel echo ::: $(cat file.txt)
# same as (using 4 colons):
parallel echo :::: file.txt
# You can get inputs from a pipe:
cat file.txt | grep "2" | parallel echo "hello"
# Every possible combination:
parallel echo ::: A B C ::: 1 2 3
# A 1
# A 2
# B 1
# A 3
# B 2
# B 3
# C 1
# C 2
# C 3
# Linked inputs:
parallel --link echo ::: A B C ::: 1 2 3
# A 1
# B 2
# C 3
## 2) Run a script in parallel
## --
echo '
JOB_ID=${1}
sleep 2
echo "Finished job #${JOB_ID}"
' > do_something.sh
# make it executable
chmod +x do_something.sh
# run this:
parallel ./do_something.sh ::: 1 2 3
# output: after 2 seconds have passed, all appear at once:
# Finished job #1
# Finished job #3
# Finished job #2
# You can also try adding –verbose to see when each job starts running.
parallel --verbose ./do_something.sh ::: 1 2 3
# ./do_something.sh 1
# ./do_something.sh 2
# ./do_something.sh 3
# Finished job #1
# Finished job #3
# Finished job #2
# And you can control how many jobs run at once with --jobs:
parallel --jobs 2 --verbose ./do_something.sh ::: 1 2 3
# Using --jobs 1 is essentially the same as a basic for loop.
# Try the same tricks with this script, which sleeps for the specified number of seconds:
echo '
sleep ${1}
echo "slept ${1} seconds."
' > do_something2.sh
chmod +x do_something2.sh
parallel ./do_something2.sh ::: 1 2 3
# output, now waiting 1 second in between each line:
# slept 1 seconds.
# slept 2 seconds.
# slept 3 seconds.
## 3) Multiple arguments:
## --
echo '
sleep ${1}
echo "${2} slept ${1} seconds"
' > do_something3.sh
chmod +x do_something3.sh
# parallel ./do_something3.sh ::: A B C ::: 1 2 3
# Whoops, teachable mistake here:
# 1 slept A seconds
# usage: sleep seconds
# 2 slept A seconds
# usage: sleep seconds
# 3 slept A seconds
# usage: sleep seconds
# 1 slept B seconds
# usage: sleep seconds
# 2 slept B seconds
# usage: sleep seconds
# 3 slept B seconds
# usage: sleep seconds
# 1 slept C seconds
# usage: sleep seconds
# 2 slept C seconds
# usage: sleep seconds
# 3 slept C seconds
# usage: sleep seconds
# This ran, for example, ./do_something3.sh A 1, so the script ran sleep A which
# is wrong, we wanted sleep 1. You can fix this by changing the order of
# arguments to ::: 1 2 3 ::: A B C, or you can specify where to insert the
# arguments – this is more flexible.
parallel ./do_something3.sh {2} {1} ::: A B C ::: 1 2 3
# A slept 1 seconds
# B slept 1 seconds
# C slept 1 seconds
# A slept 2 seconds
# B slept 2 seconds
# C slept 2 seconds
# A slept 3 seconds
# B slept 3 seconds
# C slept 3 seconds
# Much better.
## 4) Running parallel with a function
## --
# This is nice when you want something more complex than the small examples above
# but without writing a script.
# Try this: (This doesn't work for me in the ZSH shell (new Mac) though.)
function make_a_file_exported() {
seq ${2} > numbers_${1}.txt
}
export -f make_a_file_exported
parallel --link make_a_file_exported {1} {2} ::: A B C ::: 10 20 30
# Otherwise you can use env_parallel:
# follow instructions the first time you run this to make it work.
source env_parallel.bash
function make_a_file_linked() {
seq ${2} > numbers_env_${1}.txt
}
env_parallel --link make_a_file_linked {1} {2} ::: A B C ::: 10 20 30
# If neither of those work for you, google around for answers or just use scripts instead.
## 5) Practical tips for launching bioinformatics jobs
## --
# You can use parallel to orchestrate jobs for you on the cloud or on your lab/
# institution’s cluster. As an example, I’ve been using this a lot at work where
# we have a script that launches one stage at a time on Google Cloud, and then
# when a step finishes, it will take that output as the input for the next stage.
# It’s nice to use parallel to orchestrate this so I don’t have to open each of
# these orchestration scripts in different terminal tabs (or screen/tmx2 tabs).
# In this example, you have a runner_script.sh that might launch a job or a
# series of interdependent jobs on a cloud or cluster. We give it some input
# files and in this case I want to see the result at a few different filter
# values, so I get that combination of each input file with each of the 3 values
# of quality filter.
# you can make some fake input files for practice like this:
parallel touch "input_file_{1}.bam" ::: $(seq 10)
# Use a function to specify carefully how the runner script should be run:
function launch_runner_script() {
INPUT=${1}
Q_FILTER=${2}
OUTPUT=${INPUT%.bam}.output.q${Q_FILTER}
echo "runner_script.sh --input ${1} --output ${OUTPUT}.bam --filter ${Q_FILTER} 2>&1 > ${OUTPUT}.log"
}
echo "env parallel example 2"
env_parallel launch_runner_script {1} {2} ::: $(ls input_file*.bam) ::: 10 20 30
# You’ll notice a few things above:
# 1. We use the input name to determine the output name, removing the .bam
# prefix and attaching that Q_FILTER value to differentiate the outputs. I
# try to make this as clean as possible
# 2. We write a log file next to the output. ` 2>&1 >` means we’re capturing
# both stderr and stdout and writing them to the same log file. I always make
# sure to capture logs for each run individually so I can check on them along
# the way and so I have a hope of debugging any problems that arise.
# 3. We echo the runner_script.sh command, this is always a good idea to do
# first to check the output commands are what you expected before you proceed
# (then you can remove the echo and quotes to run it for real). See the
# output below:
# Output:
# runner_script.sh --input input_file_1.bam --output input_file_1.output.q30.bam --filter 30 2>&1 > input_file_1.output.q30.log
# runner_script.sh --input input_file_3.bam --output input_file_3.output.q10.bam --filter 10 2>&1 > input_file_3.output.q10.log
# runner_script.sh --input input_file_1.bam --output input_file_1.output.q20.bam --filter 20 2>&1 > input_file_1.output.q20.log
# runner_script.sh --input input_file_2.bam --output input_file_2.output.q20.bam --filter 20 2>&1 > input_file_2.output.q20.log
# runner_script.sh --input input_file_10.bam --output input_file_10.output.q30.bam --filter 30 2>&1 > input_file_10.output.q30.log
# runner_script.sh --input input_file_2.bam --output input_file_2.output.q10.bam --filter 10 2>&1 > input_file_2.output.q10.log
# runner_script.sh --input input_file_10.bam --output input_file_10.output.q10.bam --filter 10 2>&1 > input_file_10.output.q10.log
# runner_script.sh --input input_file_2.bam --output input_file_2.output.q30.bam --filter 30 2>&1 > input_file_2.output.q30.log
# runner_script.sh --input input_file_1.bam --output input_file_1.output.q10.bam --filter 10 2>&1 > input_file_1.output.q10.log
# runner_script.sh --input input_file_10.bam --output input_file_10.output.q20.bam --filter 20 2>&1 > input_file_10.output.q20.log
# runner_script.sh --input input_file_4.bam --output input_file_4.output.q20.bam --filter 20 2>&1 > input_file_4.output.q20.log
# runner_script.sh --input input_file_3.bam --output input_file_3.output.q20.bam --filter 20 2>&1 > input_file_3.output.q20.log
# runner_script.sh --input input_file_4.bam --output input_file_4.output.q10.bam --filter 10 2>&1 > input_file_4.output.q10.log
# runner_script.sh --input input_file_3.bam --output input_file_3.output.q30.bam --filter 30 2>&1 > input_file_3.output.q30.log
# runner_script.sh --input input_file_6.bam --output input_file_6.output.q20.bam --filter 20 2>&1 > input_file_6.output.q20.log
# runner_script.sh --input input_file_6.bam --output input_file_6.output.q10.bam --filter 10 2>&1 > input_file_6.output.q10.log
# runner_script.sh --input input_file_5.bam --output input_file_5.output.q30.bam --filter 30 2>&1 > input_file_5.output.q30.log
# runner_script.sh --input input_file_4.bam --output input_file_4.output.q30.bam --filter 30 2>&1 > input_file_4.output.q30.log
# runner_script.sh --input input_file_5.bam --output input_file_5.output.q20.bam --filter 20 2>&1 > input_file_5.output.q20.log
# runner_script.sh --input input_file_5.bam --output input_file_5.output.q10.bam --filter 10 2>&1 > input_file_5.output.q10.log
# runner_script.sh --input input_file_7.bam --output input_file_7.output.q10.bam --filter 10 2>&1 > input_file_7.output.q10.log
# runner_script.sh --input input_file_7.bam --output input_file_7.output.q20.bam --filter 20 2>&1 > input_file_7.output.q20.log
# runner_script.sh --input input_file_7.bam --output input_file_7.output.q30.bam --filter 30 2>&1 > input_file_7.output.q30.log
# runner_script.sh --input input_file_6.bam --output input_file_6.output.q30.bam --filter 30 2>&1 > input_file_6.output.q30.log
# runner_script.sh --input input_file_8.bam --output input_file_8.output.q30.bam --filter 30 2>&1 > input_file_8.output.q30.log
# runner_script.sh --input input_file_8.bam --output input_file_8.output.q10.bam --filter 10 2>&1 > input_file_8.output.q10.log
# runner_script.sh --input input_file_8.bam --output input_file_8.output.q20.bam --filter 20 2>&1 > input_file_8.output.q20.log
# runner_script.sh --input input_file_9.bam --output input_file_9.output.q30.bam --filter 30 2>&1 > input_file_9.output.q30.log
# runner_script.sh --input input_file_9.bam --output input_file_9.output.q10.bam --filter 10 2>&1 > input_file_9.output.q10.log
# runner_script.sh --input input_file_9.bam --output input_file_9.output.q20.bam --filter 20 2>&1 > input_file_9.output.q20.log
-
Notifications
You must be signed in to change notification settings - Fork 0
ixc7/parallel-examples
Folders and files
Name | Name | Last commit message | Last commit date | |
---|---|---|---|---|
Repository files navigation
About
GNU Parallel examples
Resources
Stars
Watchers
Forks
Releases
No releases published
Packages 0
No packages published