# for emacs: -*- mode: sh; -*-

# This file describes how to construct a UCSC genome browser for
#   any genome sequence.  Enter your genome information in the source tree
#   file: kent/src/hg/utils/phyloTrees/dbCommonScientificCladeOrderkey.tab

# for example (note tab separated columns):
# calMil1 Elephant shark Callorhinchus milii vertebrate chondrichthyes 4796


# 1. find your sequence at:
#   ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes

#  for example:
#  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Callorhinchus_milii/Callorhinchus_milii-6.1.3/


# establish your DB name to use in the following commands:

export DB=calMil1

#############################################################################
# create directory and download sequence

mkdir -p /hive/data/genomes/${DB}/genbank
cd /hive/data/genomes/${DB}/genbank

# Verify this FTP directory appears to be correct:

grep `echo $DB | sed -e 's/[0-9][0-9]*//'` $HOME/kent/src/hg/utils/phyloTrees/db.ncbiTaxId.tab  | sort | cut -f5 | tail -1 | sed -e 's#/ASSEMB.*##'

# if that looks good, then let it be used:

grep `echo $DB | sed -e 's/[0-9][0-9]*//'` $HOME/kent/src/hg/utils/phyloTrees/db.ncbiTaxId.tab  | sort | cut -f5 | tail -1 | sed -e 's#/ASSEMB.*##' | while read ftpDir
do
  time rsync -a -P \
    rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/${ftpDir}/ ./
done

#############################################################################
# establish UCSC naming scheme

# setup the scientific name:
cd /hive/data/genomes/${DB}
grep ORGANISM: genbank/ASSEMBLY_INFO  | cut -f2 > species.name.txt

#    When the assembly is composed only of unplaced scaffolds:
time ~/kent/src/hg/utils/automation/unplacedScaffolds.pl ${DB} > step1.log 2>&1
# this constructs the files and directory:
# -rw-rw-r-- 1 244682665 Mar  4 12:30 calMil1.unmasked.2bit
# drwxrwxr-x 2       512 Mar  4 12:30 ucsc
# -rw-rw-r-- 1    369313 Mar  4 12:30 chrom.sizes

# check for a mitochondrion sequence if it isn't included in the download
# query NCBI Nucleotide search for
#    "<Species name> mitochondrion complete genome"
# note the accession number, and fetch it:
cd /hive/data/genomes/${DB}/ucsc
export mitoAcc=NC_xxxxxx.x

wget -O ${mitoAcc}.fa \
 "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc"

echo ">chrM" > chrM.fa
grep -v "^>" ${mitoAcc}.fa >> chrM.fa

export mSize=`faCount chrM.fa | grep total | awk '{print $2}'`

/bin/echo -e "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+" > chrM.agp

# then, combine this chrM.agp and chrM.fa sequence with the unplaced
# agp and fa files
mv ${DB}.ucsc.fa.gz ${DB}.ucsc.fa.0.gz
(zcat ${DB}.ucsc.fa.0.gz; cat chrM.fa) | gzip -c > ${DB}.ucsc.fa.gz

mv ${DB}.ucsc.agp ${DB}.ucsc.0.agp
cat ${DB}.ucsc.0.agp chrM.agp > ${DB}.ucsc.agp

# faToTwoBit on the new sequence, checkAgpAndFa with the new agp
# replace the ${DB}.unmasked.2bit and ${DB}.agp at the top
faToTwoBit ${DB}.ucsc.fa.gz ../${DB}.unmasked.2bit
twoBitInfo ../${DB}.unmasked.2bit stdout | sort -k2nr > ../chrom.sizes
checkAgpAndFa ${DB}.ucsc.agp ../${DB}.unmasked.2bit 2>&1 | tail
# if OK, remove the old ones:
rm ${DB}.ucsc.fa.0.gz ${DB}.ucsc.0.agp

cp -p ucsc/${DB}.ucsc.agp ./${DB}.agp

#############################################################################
# construct minimal database

mkdir -p /hive/data/genomes/${DB}/bed/chromInfo
cd /hive/data/genomes/${DB}
awk '{print $1 "\t" $2 "\t/gbdb/'$DB/$DB'.2bit";}' chrom.sizes \
  > bed/chromInfo/chromInfo.tab

hgsql -e "create database $DB;" test

hgGoldGapGl -noGl ${DB} ${DB}.agp

cut -f1 bed/chromInfo/chromInfo.tab | awk '{print length($0)}' \
   | sort -nr > bed/chromInfo/t.chrSize
export chrSize=`head -1 bed/chromInfo/t.chrSize`
sed -e "s/chrom(16)/chrom($chrSize)/" \
   ${HOME}/kent/src/hg/lib/chromInfo.sql > bed/chromInfo/chromInfo.sql
rm -f bed/chromInfo/t.chrSize
hgLoadSqlTab ${DB} chromInfo bed/chromInfo/chromInfo.sql \
   bed/chromInfo/chromInfo.tab

hgsql $DB < $HOME/kent/src/hg/lib/grp.sql
mkdir -p /gbdb/${DB}

ln -s /hive/data/genomes/${DB}/${DB}.unmasked.2bit /gbdb/${DB}/${DB}.2bit
mkdir $HOME/kent/src/hg/makeDb/trackDb/assemblyHubs/${DB}

cd $HOME/kent/src/hg/makeDb/trackDb

make EXTRA="-strict" DBS=$DB
make GIT=echo EXTRA="-strict" DBS=$DB alpha

rm -f /gbdb/$DB/trackDb.ix /gbdb/$DB/trackDb.ixx

#############################################################################
# add dbDb/clade/genomeClade entries to add browser to selection menus
#   on genome-test

mkdir /hive/data/genomes/${DB}/dbDb
cd /hive/data/genomes/${DB}/dbDb

export asmInfoPath="/hive/data/genomes/${DB}/genbank/ASSEMBLY_INFO"
export dbDbInfo="${HOME}/kent/src/hg/utils/phyloTrees/dbCommonScientificCladeOrderkey.tab"
export chromSizes="/hive/data/genomes/${DB}/chrom.sizes"
export asmDate=`grep "^DATE:" ${asmInfoPath} | cut -f2 | sed -e 's/^[0-9]+-//; s/-/ /g;'`

export asmShortLabel=`grep "^ASSEMBLY SHORT NAME:" ${asmInfoPath} | cut -f2`;
export asmSciName=`grep "^ORGANISM:" ${asmInfoPath} | cut -f2`;
export asmTaxID=`grep "^TAXID:" ${asmInfoPath} | cut -f2`;
export asmSourceName=`grep "^ASSEMBLY SUBMITTER:" ${asmInfoPath} | cut -f2`;
export asmAccession=`grep "^ASSEMBLY ACCESSION:" ${asmInfoPath} | cut -f2`;
export orderKey=`grep "^$DB" $dbDbInfo | cut -f6`;
export clade=`grep "^$DB" $dbDbInfo | cut -f4`;
export commonName=`grep "^$DB" $dbDbInfo | cut -f2`;
export chrName=`head -1 $chromSizes | cut -f1`;
export chrLength=`head -1 $chromSizes | cut -f2`;
export chrStart=`echo $chrLength | awk '{printf "%d", ($1/2)-($1/10)}'`
export chrEnd=`echo $chrLength | awk '{printf "%d", ($1/2)+($1/10)}'`
export defaultPos="${chrName}:${chrStart}-${chrEnd}"

echo "DELETE from dbDb where name = \"$DB\";
INSERT INTO dbDb
    (name, description, nibPath, organism,
     defaultPos, active, orderKey, genome, scientificName,
     htmlPath, hgNearOk, hgPbOk, sourceName, taxId)
VALUES
    (\"$DB\", \"$asmDate ($asmShortLabel/$DB)\", \"/gbdb/$DB\",
     \"$commonName\",
     \"$defaultPos\", 1, $orderKey, \"$commonName\", \"$asmSciName\",
     \"/gbdb/$DB/html/description.html\", 0, 0,
     \"$asmSourceName $asmAccession\", $asmTaxID);" > ${DB}.dbDb.sql

echo \
"INSERT INTO defaultDb (genome, name) VALUES (\"${commonName}\", \"$DB\");" \
    > ${DB}.defaultDb.sql

case $clade in
  "primate")
echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"16\");"
    ;;
  "mammal")
echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"35\");"
    ;;
  "rodent")
echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"40\");"
    ;;
  "vertebrate")
echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"70\");"
    ;;
  "insect")
echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"70\");"
    ;;
esac > ${DB}.genomeClade.sql

# take a look at these to verify they are sane:
cat ${DB}.*.sql

# If they appear to be OK, load them into dbDb:
cat ${DB}.*.sql | hgsql hgcentraltest

#############################################################################
# the three repeat procedures can all start at the same time.
# the windowMasker may end with an error if it finishes before
# repeatMasker since it tries to compare its result with the RM result
# but this isn't a fatal flaw, it is just a featureBits that doesn't finish

#############################################################################
# running repeat masker:
mkdir -p /hive/data/genomes/${DB}/bed/repeatMasker
cd /hive/data/genomes/${DB}/bed/repeatMasker

export sciName=`cat ../../species.name.txt`

$ the useRMBlastn is the faster alignment engine
time (doRepeatMasker.pl -buildDir=`pwd` \
   -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
   -species "$sciName" -useRMBlastn \
   -smallClusterHub=ku ${DB}) > do.log 2>&1 &

#############################################################################
# running simple repeats:
mkdir /hive/data/genomes/${DB}/bed/simpleRepeat
cd /hive/data/genomes/${DB}/bed/simpleRepeat
time (doSimpleRepeat.pl -buildDir=`pwd` \
   -smallClusterHub=ku -workhorse=hgwdev ${DB}) > do.log 2>&1 &

#############################################################################
# running window masker:
mkdir /hive/data/genomes/${DB}/bed/windowMasker
cd /hive/data/genomes/${DB}/bed/windowMasker
time (doWindowMasker.pl -buildDir=`pwd` \
   -workhorse=hgwdev ${DB}) > do.log 2>&1 &

#############################################################################
# check gaps

mkdir /hive/data/genomes/${DB}/bed/gap
cd /hive/data/genomes/${DB}/bed/gap
findMotif -motif=gattaca -verbose=4 -strand=+ ../../${DB}.unmasked.2bit > findMotif.txt 2>&1
grep "#GAP " findMotif.txt | sed -e "s/^#GAP //" | sort -k1,1 -k2,2n > allGaps.bed
awk '{printf "%s\t%d\t'${DB}'.2bit\n", $1, $2}' ../../chrom.sizes > chromInfo.tab
featureBits test -chromSize=chromInfo.tab -not -bed=notGap.bed allGaps.bed 2> /dev/null

awk '{print $3-$2}' allGaps.bed | ave stdin > gap.stats.txt

#############################################################################
# decide to use WindowMasker or RepeatMasker result
#   usually on whichever is the greater mask, but use RM when
#   the results are similar

cd /hive/data/genomes/${DB}

    # to use Repeat Masker result:

twoBitMask bed/repeatMasker/${DB}.rmsk.2bit \
    -add bed/simpleRepeat/trfMask.bed  ${DB}.2bit
    # the warning about the bed file >=13 fields is OK

    # to use Window Masker result:
twoBitMask bed/windowMasker/${DB}.cleanWMSdust.2bit \
    -add bed/simpleRepeat/trfMask.bed  ${DB}.2bit

    # measure the results from either masking:
twoBitToFa ${DB}.2bit stdout | faSize stdin > faSize.${DB}.2bit.txt

    # and reset the symlink
    rm /gbdb/${DB}/${DB}.2bit
    ln -s /hive/data/genomes/${DB}/${DB}.2bit /gbdb/${DB}/${DB}.2bit

#############################################################################
# after the masked sequence is established, the following tracks can be created
#############################################################################
# genscan track

mkdir /hive/data/genomes/${DB}/bed/genscan
cd /hive/data/genomes/${DB}/bed/genscan
time (doGenscan.pl -buildDir=`pwd` -bigClusterHub=ku -workhorse=hgwdev \
    -dbHost=hgwdev $DB) > do.log 2>&1

#############################################################################
# CPG Islands track

mkdir /hive/data/genomes/${DB}/bed/cpgIslands
cd /hive/data/genomes/${DB}/bed/cpgIslands
time (doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
   -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev $DB) > do.log 2>&1

#############################################################################
# CPG Islands Unmasked track

mkdir /hive/data/genomes/${DB}/bed/cpgIslandsUnmasked
cd /hive/data/genomes/${DB}/bed/cpgIslandsUnmasked
time (doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
   -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
    -workhorse=hgwdev \
    -maskedSeq=/hive/data/genomes/${DB}/${DB}.unmasked.2bit $DB) > do.log 2>&1

#############################################################################
# GC 5-base track

mkdir /hive/data/genomes/${DB}/bed/gc5Base
cd /hive/data/genomes/${DB}/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 ${DB} \
  /hive/data/genomes/${DB}/${DB}.unmasked.2bit \
    | gzip -c > ${DB}.gc5Base.wigVarStep.gz
wigToBigWig ${DB}.gc5Base.wigVarStep.gz ../../chrom.sizes ${DB}.gc5Base.bw

mkdir -p /gbdb/${DB}/bbi/gc5BaseBw
rm -f /gbdb/${DB}/bbi/gc5BaseBw/gc5Base.bw
ln -s `pwd`/${DB}.gc5Base.bw /gbdb/${DB}/bbi/gc5BaseBw/gc5Base.bw
hgsql ${DB} -e "DROP TABLE IF EXISTS gc5BaseBw;
   CREATE TABLE gc5BaseBw (fileName varchar(255) not null);
      INSERT INTO gc5BaseBw VALUES (\"/gbdb/${DB}/bbi/gc5BaseBw/gc5Base.bw\");"

#############################################################################
