### making the snpView for the mito browser

This script demonstrates making the snpView for one of
the species in the mito browser.   In this case, the hg38a
species (==hg38 chrM)

This script will be used later with input arg parameters in order to
make up the snpView for each target reference.

#######################################################################

#!/bin/bash

export allDists="/cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/all_dists"

export target="hg38a"
export Nway="104way"
export dbTable="${target}_${Nway}"
export mafFile="/hive/data/genomes/mito/trackData/mafFiles/cactus/${target}.104way.maf"

mkdir -p /hive/data/genomes/mito/bed/snpView/${target}
cd  /hive/data/genomes/mito/bed/snpView/${target}

mkdir -p /gbdb/mito/maf
if [ "${mafFile}" -nt species.list ]; then
rm -f /gbdb/mito/maf/${target}.maf

# translate the names to avoid the underbars and . versions in
#  the 'species' names

sed -e 's/GCA_\([0-9]\+\)\./GCAu\1v/g; s/GCF_\([0-9]\+\)\./GCFu\1v/g;' \
   "${mafFile}" > noUnderbar.maf

rm -f mito.${Nway}.nh
ln -s ../../../findSequences/finalPlus7.nh ./mito.${Nway}.nh

# 'target' species is always 'mito'
printf "mito\n" > species.list

# order species via their distance in the phylo-tree
$allDists mito.${Nway}.nh | grep -w "${target}" | sed -e "s/${target}.//;" \
   | sort -k2,2n | cut -f1 \
     | sed -e 's/GCA_\([0-9]\+\)\./GCAu\1v/g; s/GCF_\([0-9]\+\)\./GCFu\1v/g;' \
        >> species.list

ln -s `pwd`/noUnderbar.maf /gbdb/mito/maf/${target}.maf

# temporary load this maf file into 'mito' database for the convenience
#   of the mafGene command
hgLoadMaf -pathPrefix=/gbdb/mito/maf \
   -loadFile=../../mafFiles/cactus/${target}.${Nway}.maf mito ${dbTable}

fi

# the genes for this target
### XXX should the genes for all the other species also be here ?
if [ "../../genes/hg38/hg38_chrM.gp.gz" -nt "${target}.gp" ]; then
  zcat ../../genes/hg38/hg38_chrM.gp.gz > "${target}.gp"
  touch -r ../../genes/hg38/hg38_chrM.gp.gz "${target}.gp"
fi

if [ "${target}.gp" -nt nonsyn.faa ]; then
  mafGene -exons mito ${dbTable} -useFile "${target}.gp" species.list nonsyn.faa
  touch -r "${target}.gp" nonsyn.faa
fi

# the 1583 is the soTerm for missense_variant
if [ nonsyn.faa -nt nonsyn.bed ]; then
  paSNP species.list nonsyn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \
        | awk '{printf "%s\t%d\t%d\t%s\t1583\t+\t%d\t%d\t255,0,0\t1\t%d\t0\n", $1, $2-1, $3, $4, $2-1, $3, $3-($2 - 1)}' > nonsyn.bed
  touch -r nonsyn.faa nonsyn.bed
fi

if [ "${target}.gp" -nt syn.faa ]; then
  mafGene -uniqAA -exons mito ${dbTable} -useFile "${target}.gp" species.list syn.faa
  touch -r "${target}.gp" syn.faa
fi

# the 1819 is the soTerm for synonymous_variant
if [ syn.faa -nt syn.bed ]; then
  paSNP species.list syn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \
        | awk '{printf "%s\t%d\t%d\t%s\t1819\t+\t%d\t%d\t0,255,0\t1\t%d\t0\n", $1, $2-1, $3, $4, $2-1, $3, $3-($2 - 1)}' > syn.bed
  touch -r syn.faa syn.bed
fi

if [ "${target}.gp" -nt single.bed ]; then
  mafToSnpBed mito noUnderbar.maf "${target}.gp" stdout \
     | sed -e 's/mito.//;' > single.bed
  touch -r "${target}.gp" single.bed
fi

# select out based on 'soTerm' identifier in the last column
#  soTerms defined in src/hg/inc/soTerm.h
#     1       2  3         4          5
# hg38a_chrM 72 73 GCAu018873775v2 1628

if [ single.bed -nt codingVariant.bed ]; then
  awk '$NF == 1580' single.bed \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \
       | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \
           | sort -k1,1 -k2,2n > codingVariant.bed
  touch -r single.bed codingVariant.bed
fi

if [ single.bed -nt utrVariant.bed ]; then
  awk '$NF == 1623 || $NF == 1624' single.bed \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \
       | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \
           | sort -k1,1 -k2,2n > utrVariant.bed
  touch -r single.bed utrVariant.bed
fi

if [ single.bed -nt missing.bed ]; then
  awk '$NF == 0' single.bed \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \
       | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \
           | sort -k1,1 -k2,2n > missing.bed
  touch -r single.bed missing.bed
fi

if [ single.bed -nt intergenic.bed ]; then
  awk '$NF == 1628' single.bed \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \
       | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \
           | sort -k1,1 -k2,2n > intergenic.bed
  touch -r single.bed intergenic.bed
fi

if [ single.bed -nt intron.bed ]; then
  awk '$NF == 1627' single.bed \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \
       | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \
           | sort -k1,1 -k2,2n > intron.bed
  touch -r single.bed intron.bed
fi

if [ species.list -nt mito.species.list ]; then
   grep -v -w mito species.list \
     | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \
        > mito.species.list
   touch -r species.list mito.species.list
fi

if [ syn.bed -nt output.bed -o intergenic.bed -nt output.bed ]; then
for S in `cat mito.species.list`
do
  grep -wh "${S}" nonsyn.bed syn.bed codingVariant.bed utrVariant.bed intron.bed intergenic.bed missing.bed \
     | bedSmash stdin ../../../chrom.sizes stdout
done | sort -k1,1 -k2,2n > output.bed
   touch -r intergenic.bed output.bed
fi

if [ output.bed -nt load.bed ]; then
   awk '{print $1,$2,$3,$4,$5}' output.bed | sort -k1,1 -k2,2n > load.bed
   bedToBigBed -type=bed4+1 -as=snpView.as load.bed \
      ../../../chrom.sizes mafSnpHg38a.bb
fi
##########################################################################
