# for emacs: -*- mode: sh; -*-

# Pteropus vampyrus

#	http://www.ncbi.nlm.nih.gov/bioproject/20325
#	http://www.ncbi.nlm.nih.gov/genome/757
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRP00

#########################################################################
# DOWNLOAD SEQUENCE
    ssh kkstore05
    mkdir /cluster/store12/pteVam1
    ln -s /cluster/store12/pteVam1 /cluster/data
    mkdir /cluster/data/pteVam1/broad
    cd /cluster/data/pteVam1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.quals.gz
    md5sum ass* > assembly.md5sum

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin pteVam1.qual.qac

    cut -f 1 assembly.agp | uniq -c | wc -l
# Number of scaffolds: 96944

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh kolossus
    cd /cluster/data/pteVam1
cat << _EOF_ >pteVam1.config.ra
# Config parameters for makeGenomeDb.pl:
db pteVam1
clade mammal
genomeCladePriority 35
scientificName Pteropus vampyrus
commonName Megabat
assemblyDate Jul. 2008
assemblyLabel Broad Institute pteVam1
orderKey 233.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/pteVam1/broad/assembly.bases.gz
agpFiles /cluster/data/pteVam1/broad/assembly.agp
qualFiles /cluster/data/pteVam1/broad/pteVam1.qual.qac
dbDbSpeciesDir megabat
_EOF_

# use 'screen' make sure on kkstore05
    makeGenomeDb.pl -workhorse kolossus pteVam1.config.ra > makeGenomeDb.out 2>&1 &

    cut -f 2 chrom.sizes | ave stdin
# Q1 1162.000000
# median 1777.000000
# Q3 9074.500000
# average 20589.994327
# min 601.000000
# max 1019178.000000
# count 96944
# total 1996076410.000000
# standard deviation 54685.855293

#########################################################################
## Repeat Masker (DONE - 2008-10-16 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/pteVam1/bed/repeatMasker
    cd /hive/data/genomes/pteVam1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` pteVam1 > do.log 2>&1 &
    #	real    806m48.065s
    twoBitToFa pteVam1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
    #	1996076410 bases (156639750 N's 1839436660 real 1256029530 upper
    #	583407130 lower) in 96944 sequences in 1 files
    #	%29.23 masked total, %31.72 masked real

#########################################################################
# SIMPLE REPEATS TRF (WORKING - 2008-10-15 - Hiram)
    screen # use a screen to manage this job
    mkdir /cluster/data/pteVam1/bed/simpleRepeat
    cd /cluster/data/pteVam1/bed/simpleRepeat
    #
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/pteVam1/bed/simpleRepeat pteVam1 > do.log 2>&1 &
    #	real    98m59.097
    cat fb.simpleRepeat
    #	33179383 bases of 1839436660 (1.804%) in intersection

XXX - waiting for RM to complete 2008-10-15 13:32
    #	after RM run is done, add this mask:
    cd /hive/data/genomes/pteVam1
    rm pteVam1.2bit
    twoBitMask pteVam1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed pteVam1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa pteVam1.2bit stdout | faSize stdin > pteVam1.2bit.faSize.txt
# 1996076410 bases (156639750 N's 1839436660 real 1255425444 upper 584011216
# lower) in 96944 sequences in 1 files
# %29.26 masked total, %31.75 masked rea

    #	link to gbdb
    ln -s `pwd`/pteVam1.2bit /gbdb/pteVam1

###########################################################################
# prepare for kluster runs (DONE _ 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	pteVam1: 1839436660
    # thus: 1024 * 1839436660/2881421696 = 653
    #	rounding up to 700 for a bit more conservative masking
    cd /hive/data/genomes/pteVam1
    time blat pteVam1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=pteVam1.11.ooc -repMatch=700
    #	Wrote 19840 overused 11-mers to pteVam1.11.ooc
    #	real    1m44.676s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/pteVam1
    cp -p pteVam1.2bit chrom.sizes pteVam1.11.ooc \
	/hive/data/staging/data/pteVam1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute pteVam1 (NCBI project 20325, ABRP01000000)" where name="pteVam1";' hgcentraltest

############################################################################
#  pteVam1 - Megabat - Ensembl Genes version 51  (DONE - 2008-12-02 - hiram)
    ssh kkr14u07
    cd /hive/data/genomes/pteVam1
    cat << '_EOF_' > pteVam1.ensGene.ra
# required db variable
db pteVam1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#       names that are not in the UCSC assembly
skipInvalid yes
# ignore the two genes that have invalid structures from Ensembl:
#       6381: ENSPVAT00000012919 no exonFrame on CDS exon 14
#       23522: ENSPVAT00000010661 no exonFrame on CDS exon 0
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 pteVam1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/pteVam1/bed/ensGene.51
    featureBits pteVam1 ensGene
    # 28722584 bases of 1839436660 (1.561%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/pteVam1/bed/ensGene.51

############################################################################
# cpgIslands - (DONE - 2011-04-24 - Hiram)
    mkdir /hive/data/genomes/pteVam1/bed/cpgIslands
    cd /hive/data/genomes/pteVam1/bed/cpgIslands
    time doCpgIslands.pl pteVam1 > do.log 2>&1
    #   real    239m18.855s

    cat fb.pteVam1.cpgIslandExt.txt
    #   49493178 bases of 1839436660 (2.691%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/pteVam1/bed/genscan
    cd /hive/data/genomes/pteVam1/bed/genscan
    time doGenscan.pl pteVam1 > do.log 2>&1
    #   Elapsed time: 356m2s

    cat fb.pteVam1.genscan.txt
    #   55912473 bases of 1839436660 (3.040%) in intersection
    cat fb.pteVam1.genscanSubopt.txt
    #   60619613 bases of 1839436660 (3.296%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S pteVam1
    mkdir /hive/data/genomes/pteVam1/bed/windowMasker
    cd /hive/data/genomes/pteVam1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev pteVam1 > do.log 2>&1
    #   real    278m34.535s
    # fixing the last couple of commands from doLoadClean step
    time ./lastClean.csh
    # real    4m47.606s

    cat fb.pteVam1.windowmaskerSdust.clean.txt
    #   514392905 bases of 1996076410 (25.770%) in intersection

    cat faSize.pteVam1.cleanWMSdust.txt
    #   1996076410 bases (156639750 N's 1839436660 real 1325043755 upper 514392905 lower) in 96944 sequences in 1 files
    #   Total size: mean 20590.0 sd 54686.1 min 601 (scaffold_96943) max 1019178 (scaffold_0) median 1777
    #   %25.77 masked total, %27.96 masked real

     cat fb.pteVam1.rmsk.windowmaskerSdust.txt
    #   238443817 bases of 1996076410 (11.946%) in intersection

    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \
        -dbHost=hgwdev pteVam1 > cleanup.log 2>&1
    #   real    2m19.430s

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1839436660 / 2897316137 \) \* 1024
    #	( 1839436660 / 2897316137 ) * 1024 = 650.113088

    # round up to 700

    cd /hive/data/genomes/pteVam1
    time blat pteVam1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/pteVam1.11.ooc -repMatch=700
    #   Wrote 19840 overused 11-mers to jkStuff/pteVam1.11.ooc
    #   real    0m31.002s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" pteVam1 | sort | uniq -c
    #   291864 yes
#    cd /hive/data/genomes/pteVam1/jkStuff
#    gapToLift pteVam1 pteVam1.nonBridged.lift -bedFile=pteVam1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' pteVam1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Pteropus vampyrus	2	1	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add pteVam1 just after ce2
# pteVam1 (megabat)
pteVam1.serverGenome = /hive/data/genomes/pteVam1/pteVam1.2bit
pteVam1.clusterGenome = /hive/data/genomes/pteVam1/pteVam1.2bit
pteVam1.ooc = /hive/data/genomes/pteVam1/jkStuff/pteVam1.11.ooc
pteVam1.lift = no
pteVam1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
pteVam1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
pteVam1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
pteVam1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
pteVam1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
pteVam1.refseq.mrna.native.load = no
pteVam1.refseq.mrna.xeno.load = yes
pteVam1.genbank.mrna.xeno.load = yes
pteVam1.genbank.est.native.load = no
pteVam1.downloadDir = pteVam1
pteVam1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding pteVam1 megabat" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for pteVamNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S pteVam1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial pteVam1 &
    #   var/build/logs/2012.06.08-09:55:28.pteVam1.initalign.log
    #   real    1862m46.698s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad pteVam1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-13:09:05.dbload.log
    #   real    14m48.000s

    # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add pteVam1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added pteVam1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_8765:35746-39000" where name="pteVam1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/pteVam1/pushQ
    cd /hive/data/genomes/pteVam1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl pteVam1 2> stderr.txt | grep -v transMap > pteVam1.sql
    #   real    3m39.590s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/pteVam1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/pteVam1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/pteVam1/bbi/quality.bw
# WARNING: pteVam1 does not have seq
# WARNING: pteVam1 does not have extFile
# WARNING: pteVam1 does not have estOrientInfo

    scp -p pteVam1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/pteVam1.sql"

############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
# add chromAlias table (DONE - 2016-10-15 - Hiram)

    mkdir /hive/data/genomes/pteVam1/bed/chromAlias
    cd /hive/data/genomes/pteVam1/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' pteVam1 \
        | sort -k1,1 > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' pteVam1 \
        | sort -k1,1 > ucsc.genbank.tab

    ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16
    join -t$'\t' ../idKeys/pteVam1.idKeys.txt \
        ../../ens95/ensPteVam1.idKeys.txt | cut -f2- \
          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
               | sort -k1,1 -k2,2n > ucscToEns.bed
    cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab
    # and chrM name for genbank is: AJ421455.1
    wc -l *.tab
#    6835 ucsc.ensembl.tab
#    6835 ucsc.genbank.tab
#    6835 ucsc.refseq.tab

    grep chrM *.tab
# ucsc.ensembl.tab:chrM   MT
# ucsc.genbank.tab:chrM   AJ421455.1
# ucsc.refseq.tab:chrM    NC_004299.1

    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
	> pteVam1.chromAlias.tab

for t in refseq genbank ensembl
do
  c0=`cat ucsc.$t.tab | wc -l`
  c1=`grep $t pteVam1.chromAlias.tab | wc -l`
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 6835 =? 6835 OK
# checking genbank: 6835 =? 6835 OK
# checking ensembl: 6835 =? 6835 OK

    hgLoadSqlTab pteVam1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        pteVam1.chromAlias.tab

#########################################################################
