{%- include "header" -%}
{# Keep a blank line #}
#----------------------------#
# Run
#----------------------------#
log_warn "info.sh"

#----------------------------#
# filtered species.tsv
#----------------------------#
# Derive species-f.tsv from species.tsv by chaining one tsv-join stage per
# template-supplied list file (paths are relative to the parent directory):
#   * every file in `ins` is a filter file; rows are kept only when their
#     first field matches a key in it;
#   * every file in `not_ins` is applied with --exclude; rows whose first
#     field matches a key in it are dropped.
# The closing `cat` terminates the pipeline, so the script stays valid even
# when both lists are empty.
log_info "Protein/species-f.tsv"
cat species.tsv |
{% for i in ins -%}
    tsv-join --filter-file ../{{ i }} --key-fields 1 |
{% endfor -%}
{% for i in not_ins -%}
    tsv-join --exclude --filter-file ../{{ i }} --key-fields 1 |
{% endfor -%}
    cat > species-f.tsv

#----------------------------#
# info.tsv
#----------------------------#
# For every distinct value in column 2 of species-f.tsv (used as a species
# directory name), build "${SPECIES}"/info.tsv with the columns
#   #name  id  rep  strain  size  annotation
# by joining three per-species helper tables on the protein id:
#   * temp.strain.tsv - one row per protein header found in the strain
#                       *_protein.faa.gz files (#name, id, strain, annotation)
#   * temp.sizes.tsv  - `faops size` output of pro.fa.gz (id, size)
#   * temp.clust.tsv  - res_cluster.tsv with a header added (rep, id)
#
# A species is skipped when info.tsv already exists (it doubles as the
# "already done" marker) or when res_cluster.tsv is missing or empty.
log_info "info.tsv"
cat species-f.tsv |
    tsv-select -f 2 |
    tsv-uniq |
while IFS= read -r SPECIES; do
    # Already built on a previous run
    if [[ -f "${SPECIES}"/info.tsv ]]; then
        continue
    fi

    # Nothing to report without clustering results
    if [[ ! -s "${SPECIES}"/res_cluster.tsv ]]; then
        continue
    fi

    log_debug "${SPECIES}"

    # strains.tsv: column 1 is used as the strain name, column 2 as the
    # directory level above it under ../ASSEMBLY. Strains without an assembly
    # directory are silently skipped.
    #
    # Header clean-up applied to every ">" line of the protein FASTA:
    #   1. if the line holds more than one "[", turn the first one into "(";
    #   2. if the line holds more than one "]", turn the first one into ")";
    #   3. drop the trailing " [...]" group;
    #   4. remove "MULTISPECIES: ";
    #   5. split "ACCESSION.VERSION description" into the output columns.
    #
    # The strain name reaches perl through the STRAIN environment variable
    # instead of being pasted into the perl source, where it would be parsed
    # as a bareword expression (breaking on names with "-", "." or only digits).
    printf '#name\tid\tstrain\tannotation\n' > "${SPECIES}"/temp.strain.tsv
    cat "${SPECIES}"/strains.tsv |
        parallel --colsep '\t' --no-run-if-empty --linebuffer -k -j 1 '
            if [[ ! -d "../ASSEMBLY/{2}/{1}" ]]; then
                exit
            fi

            gzip -dcf ../ASSEMBLY/{2}/{1}/*_protein.faa.gz |
                grep "^>" |
                sed "s/^>//" |
                perl -nl -e '\'' /\[.+\[/ and s/\[/\(/; print; '\'' |
                perl -nl -e '\'' /\].+\]/ and s/\]/\)/; print; '\'' |
                perl -nl -e '\'' s/\s+\[.+?\]$//g; print; '\'' |
                sed "s/MULTISPECIES: //g" |
                STRAIN={1} perl -nl -e '\''
                    /^(\w+)\.(\d+)\s+(.+)$/ or next;
                    printf qq(%s_%s\t%s.%s\t%s\t%s\n), $ENV{STRAIN}, $1, $1, $2, $ENV{STRAIN}, $3;
                '\''
        ' \
        >> "${SPECIES}"/temp.strain.tsv

    printf 'id\tsize\n' > "${SPECIES}"/temp.sizes.tsv
    faops size "${SPECIES}"/pro.fa.gz >> "${SPECIES}"/temp.sizes.tsv

    printf 'rep\tid\n' > "${SPECIES}"/temp.clust.tsv
    cat "${SPECIES}"/res_cluster.tsv >> "${SPECIES}"/temp.clust.tsv

    # After both joins the columns are: #name id strain annotation size rep.
    # tsv-select reorders them to:      #name id rep strain size annotation.
    # The result goes to a temporary file first; see the mv below.
    tsv-join -H \
        "${SPECIES}"/temp.strain.tsv \
        --data-fields id \
        -f "${SPECIES}"/temp.sizes.tsv \
        --key-fields id \
        --append-fields 2 |
        tsv-join -H \
            --data-fields id \
            -f "${SPECIES}"/temp.clust.tsv \
            --key-fields id \
            --append-fields 1 |
        tsv-select -f 1,2,6,3,5,4 \
        > "${SPECIES}"/temp.info.tsv

    # Publish with a rename so an interrupted run never leaves a truncated
    # info.tsv behind, which the existence check above would otherwise treat
    # as finished on every later run.
    mv -f "${SPECIES}"/temp.info.tsv "${SPECIES}"/info.tsv

    rm -f "${SPECIES}"/temp.*.tsv

done

# Report completion and leave with a success status.
log_info "Done."

exit 0