#!/bin/sh set -e self="$$" tmp="$(mktemp -d)" # Preparing exit children="" trapf() { rm -rf "$tmp" [ -n "$children" ] && kill $children } trap trapf EXIT KILL # What to do if the XDG standard isn't there... if [ -z "$XDG_CONFIG_HOME" ]; then if [ -d "$HOME/Library/Preferences/" ]; then config_default="$HOME/Library/Preferences/Unipept" elif [ -d "$HOME/.config" ]; then # why weren't the XDG variables set then? config_default="$HOME/.config/unipept" else config_default="$HOME/.unipept" fi else config_default="$XDG_CONFIG_HOME/unipept" fi USAGE=" Some default configurations for the UMGAP. Usage: $0 -1 [-t ] [-z] -o $0 -1 -2 [-t ] [-z] -o Where: A (optionally GZIP-compressed) FASTA file of reads. A (optionally GZIP-compressed) FASTQ file of reads (first end). A (optionally GZIP-compressed) FASTQ file of reads (second end). The type of analysis, any of: - max-sensitivity - high-sensitivity - tryptic-sensitivity - tryptic-precision - high-precision (the default) - max-precision The output file. Use '-' to write to stdout. Options: -z GZIP-compress before writing the following output file. -c dir The configuration directory. Defaults to '$config_default'. Note: you can call this script with multiple samples, by just repeating above arguments. It will then share files loaded in memory in between samples to save time: $0 -1 -o -1 -2 -t -o ... " # =========================================================================== # # Some functions. # =========================================================================== # # Logging log() { [ -z "$VERBOSE" ] && return printf "log: %s\n" "$*" >&2 } debug() { [ -z "$VERBOSE" -a -z "$DEBUG" ] && return printf "debug: %s\n" "$*" >&2 } # print stuff to stderr and exits with fault crash() { debug "encountered error" echo "$*" >&2 exit 1 } # check if a filename contains tabs or newlines checkname() { debug "checking if filename '$1' is valid" [ "${1}" = "${1% *}" ] || crash "Tabs in filenames are unsupported." } # function to fetch the configuration directory configdir="" getconfigdir() { if [ -n "$configdir" ]; then echo "$configdir" elif [ -d "$config_default" ]; then echo "$config_default" elif [ -d /etc/umgap ]; then echo /etc/umgap else crash "No configuration directory found. Please run umgap-setup or use the '-c' argument." fi } # =========================================================================== # # Environmental checks. # =========================================================================== # debug "checking if umgap is installed" if ! umgap -V > /dev/null; then crash 'Cannot find the umgap executable. Please ensure it is installed and located in your $PATH.' fi # =========================================================================== # # Argument parsing. # =========================================================================== # debug "parsing the arguments" count=0 fgspp_used="false" tryptic_used="false" ninemer_used="false" samples="" # defaults compress="false" type="high-precision" tmpfiles="" while getopts c:1:2:t:zo: f; do case "$f" in c) configdir="$OPTARG" ;; z) compress="true" ;; 1) checkname "$OPTARG"; infile1="$OPTARG" ;; 2) checkname "$OPTARG"; infile2="$OPTARG" ;; t) checkname "$OPTARG"; type="$OPTARG" ;; o) checkname "$OPTARG"; outfile="$OPTARG" ;; \?) crash "$USAGE" ;; esac if [ "$f" = "o" ]; then # o is the last option of each series count="$(( count + 1 ))" log "Checking sample $count:" case "$type" in "max-sensitivity") ninemer_used="true" ;; "high-sensitivity") ninemer_used="true" ;; "tryptic-sensitivity") fgspp_used="true" ; tryptic_used="true" ;; "tryptic-precision") fgspp_used="true" ; tryptic_used="true" ;; "high-precision") fgspp_used="true" ; ninemer_used="true" ;; "max-precision") fgspp_used="true" ; ninemer_used="true" ;; *) crash "Unrecognized type: '$type'." esac log "- Using '$type'." if [ -n "$infile1" ]; then log "- found input file 1: '$infile1'" filetype="$(file --mime-type "$infile1")" || \ crash "Could not determine filetype of '$infile1'." if [ "$filetype" != "${filetype%gzip}" ]; then log "- input file 1 is compressed" mkfifo "$tmp/gunzip1-$count" { zcat "$infile1" > "$tmp/gunzip1-$count" || kill "$self"; } > /dev/null & infile1="$tmp/gunzip1-$count" fi fi if [ -n "$infile2" ]; then log "- found input file 2: '$infile2'" filetype="$(file --mime-type "$infile2")" || \ crash "Could not determine filetype of '$infile2'." if [ "$filetype" != "${filetype%gzip}" ]; then log "- input file 2 is compressed" mkfifo "$tmp/gunzip2-$count" { zcat "$infile2" > "$tmp/gunzip2-$count" || kill "$self"; } > /dev/null & infile2="$tmp/gunzip2-$count" fi fi if [ -n "$infile2" ]; then if [ -n "$infile1" ]; then log "- found two input files, assuming paired-end FASTQ" mkfifo "$tmp/infile-$count" { umgap fastq2fasta "$infile1" "$infile2" > "$tmp/infile-$count" || kill "$self"; } > /dev/null & infile="$tmp/infile-$count" else crash "Encountered a second input file without a first." fi elif [ -n "$infile1" ]; then infile="$infile1" else crash "Encountered an output file without input files." fi if [ "$outfile" = "-" ]; then mkfifo "$tmp/outfile-$count" cat "$tmp/outfile-$count" & outfile="$tmp/outfile-$count" fi if [ "$compress" = "true" ]; then mkfifo "$tmp/gzip-$count" { gzip - < "$tmp/gzip-$count" > "$outfile" || kill "$self"; } > /dev/null & outfile="$tmp/gzip-$count" fi samples="$(printf '%s%s\t%s\t%s\t' "$samples" "$type" "$infile" "$outfile")" log "" # reset for next compress="false" infile1= infile2= type="high-precision" outfile= fi done if [ "$count" -eq 0 ]; then echo "$USAGE" exit fi # =========================================================================== # # Environmental checks. # =========================================================================== # if [ "$fgspp_used" = "true" ]; then debug "checking if FGSpp is installed" if ! [ -h "$(getconfigdir)/FGSpp" ]; then crash "FragGeneScan++ not found. Please run umgap-setup." fi fi versions="$(find -H "$(getconfigdir)" -mindepth 1 -maxdepth 1 \ -printf '%P\n' | sort -n)" for candidate in $versions; do [ ! -h "$(getconfigdir)/$candidate/taxons.tsv" ] && continue [ "$tryptic_used" = "true" -a ! -h "$(getconfigdir)/$candidate/tryptic.fst" ] && continue [ "$ninemer_used" = "true" -a ! -h "$(getconfigdir)/$candidate/ninemer.fst" ] && continue version="$candidate" done [ -n "$version" ] || crash "No data version found valid for all samples. Please run umgap-setup." debug "using version '$version'" # =========================================================================== # # The actual pipeline # =========================================================================== # fgspp() { "$(getconfigdir)/FGSpp/FGSpp" -s stdin -o stdout -w 0 \ -r "$(getconfigdir)/FGSpp/train" -t "illumina_10" -p 4 -c 2 } taxons="$(getconfigdir)/$version/taxons.tsv" ninemers="$(getconfigdir)/$version/ninemer.fst" tryptics="$(getconfigdir)/$version/tryptic.fst" if [ "$ninemer_used" = "true" ]; then log "Loading index in memory." socket="$tmp/socket" { umgap prot2kmer2lca -m -o -s "$socket" "$ninemers" || kill "$self"; } > /dev/null & ninemer_pid="$!" children="$(pgrep -P "$!") $ninemer_pid" while [ ! -S "$socket" ] && sleep 1; do true; done fi # for each sample count=0 while [ -n "${samples}" ]; do type="${samples%% *}" ; samples="${samples#* }" infile="${samples%% *}" ; samples="${samples#* }" outfile="${samples%% *}" ; samples="${samples#* }" count="$(( count + 1 ))" log "Running sample ${count}." case "$type" in "max-sensitivity") umgap translate -a | # translate 6 frames nc -NU "$socket" | # map to taxa umgap seedextend -g1 -s2 | # seedextend filter umgap uniq -d / | # join paired ends umgap taxa2agg -l1 -m rmq -a mrtl "$taxons" ;; # aggregate "high-sensitivity") umgap translate -a | # translate 6 frames nc -NU "$socket" | # map to taxa umgap seedextend -g1 -s3 | # seedextend filter umgap uniq -d / | # join paired ends umgap taxa2agg -l1 -a hybrid -f 0.25 "$taxons";; # aggregate "tryptic-sensitivity") fgspp | # gene prediction umgap prot2tryp2lca -l9 -L45 "$tryptics" | # map to taxa umgap uniq -d / | # join paired ends umgap taxa2agg -l1 -m rmq -a mrtl "$taxons" ;; # aggregate "tryptic-precision") fgspp | # gene prediction umgap prot2tryp2lca -l9 -L45 "$tryptics" | # map to taxa umgap uniq -d / | # join paired ends umgap taxa2agg -l5 -m rmq -a mrtl "$taxons" ;; # aggregate "high-precision") fgspp | # gene prediction nc -NU "$socket" | # map to taxa umgap seedextend -g1 -s3 | # seedextend filter umgap uniq -d / | # join paired ends umgap taxa2agg -l2 -a lca\* "$taxons" ;; # aggregate "max-precision") fgspp | # gene prediction nc -NU "$socket" | # map to taxa umgap seedextend -g1 -s4 | # seedextend filter umgap uniq -d / | # join paired ends umgap taxa2agg -l5 -a lca\* "$taxons" ;; # aggregate esac < "$infile" > "$outfile" done