# for emacs: -*- mode: sh; -*-

#       $Id: macEug2.txt,v 1.6 2010/05/06 16:27:44 chinhli Exp $

# Macropus eugenii (wallaby) - TWGSC  Meug_1.1 (2009-09-29)

#	http://www.ncbi.nlm.nih.gov/genome/233
#	http://www.ncbi.nlm.nih.gov/bioproject/12587
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABQO01

# file template copied from susScr2.txt

DATE:	29-SEP-2009
ORGANISM:	Macropus eugenii
TAXID:	9315
ASSEMBLY LONG NAME:	Meug_1.1
ASSEMBLY SHORT NAME:	Meug_1.1
ASSEMBLY SUBMITTER:	Tammar Wallaby Genome Sequencing Consortium
ASSEMBLY TYPE:	Haploid
NUMBER OF ASSEMBLY-UNITS:	1
Assembly Accession:	GCA_000004035.1


#  Macropus eugenii
#   (NCBI Project ID: 12586, Accession: GCA_000004035.1)
#   by Tammar Wallaby Genome Sequencing Consortium
# ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macropus_eugenii/Meug_1.1/

##########################################################################
# Download sequence (DONE- 2010-10-22 - Chin)
    # Create the assembly build directory and a genbank/ subdirectory that
    # holds the files exactly as distributed by NCBI.
    mkdir /hive/data/genomes/macEug2
    cd /hive/data/genomes/macEug2
    mkdir genbank
    cd genbank
    # Recursively mirror the Meug_1.1 release directory.  The host name and
    # the six leading remote path components are dropped from the local
    # paths, so the release's sub-directories land directly in genbank/.
    wget --timestamping --recursive --level=0 --cut-dirs=6 \
        --no-host-directories --force-directories \
        --no-remove-listing --no-parent \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macropus_eugenii/Meug_1.1/*"
    # FINISHED --2010-10-22 08:54:16--
    # Downloaded: 12 files, 1.0G in 13m 49s (1.28 MB/s)
    # Read ASSEMBLY_INFO

    # stay in the genbank directory
    # Process the unplaced scaffolds: strip the ".1" version suffix from
    #   the sequence names (i.e. ABQO010000034.1 -> ABQO010000034), since
    #   MySQL does not allow "." as part of a table name and it
    #   will cause problems in the genbank task step later

    # Location of the unplaced-scaffold files inside the NCBI download.
    export S=Primary_Assembly/unplaced_scaffolds

    # AGP: pass the "#" header lines through untouched; on every other line
    # strip the ".1" version suffix from the object name in column 1 only.
    # The pattern is anchored to the end of the first field so it cannot
    # clip a ".1" that merely starts a longer version (".10") or that sits
    # in a later column.  Compressing in the same pipeline avoids leaving
    # a background gzip racing with the steps that read macEug2.agp.gz.
    zcat "${S}/AGP/unplaced.scaf.agp.gz" \
        | sed -e '/^#/!s/^\([^[:space:]]*\)\.1\([[:space:]]\)/\1\2/' \
        | gzip -c > macEug2.agp.gz

    # FASTA: only header lines are edited.  Everything up to and including
    # "|gb|" is replaced by ">", everything from the next "|" onward is
    # dropped, and then a trailing ".1" is removed from what is left.
    # Sequence lines are passed through unchanged.
    zcat "${S}/FASTA/unplaced.scaf.fa.gz" \
        | sed -e '/^>/{' -e 's#^>.*|gb|#>#' -e 's#|.*##' \
              -e 's/\.1$//' -e '}' \
        | gzip -c > macEug2.fa.gz
    # count the sequences that made it into the cleaned file
    zcat macEug2.fa.gz | grep "^>" | wc
    # 277711  277711 5189116

    # size statistics of the original download, for comparison
    faSize "${S}/FASTA/unplaced.scaf.fa.gz"
    # 3075184024 bases (539107070 N's 2536076954 real 2536076954 upper
    #   0 lower) in 277711 sequences in 1 files



    # N50
    mkdir N50
    # Only the compressed macEug2.fa.gz is created by the steps in this
    # doc, so read that file (assumes faCount opens .gz input the same way
    # faSize is used on a .gz in this doc -- TODO confirm).
    # Keep just the name and length columns of the GL*/ABQO* sequence rows.
    faCount macEug2.fa.gz | awk '/^(GL|ABQO)/ {print $1, $2}' \
        > N50/chrom.sizes
    n50.pl N50/chrom.sizes
#       reading: N50/chrom.sizes
#       contig count: 277711, total size: 3075184024, one half size:
#       1537592012
# cumulative    N50 count       contig  contig size
# 1537563674      24532   GL116276        36603
# 1537592012 one half size
# 1537600276      24533   GL121428        36602


#########################################################################
# Initial makeGenomeDb.pl (DONE - 2010-11-04 - Chin)
    cd /hive/data/genomes/macEug2
    cat << '_EOF_' > macEug2.config.ra
# Config parameters for makeGenomeDb.pl:
db macEug2
clade mammal
genomeCladePriority 67
scientificName Macropus eugenii
commonName Wallaby
assemblyDate Sep. 2009
assemblyLabel TWGS (NCBI Project ID: 12586, Accession: GCA_000004035.1)
assemblyShortLabel TWGS Meug_1.1
orderKey 278
mitoAcc none
fastaFiles /hive/data/genomes/macEug2/genbank/macEug2.fa.gz
agpFiles /hive/data/genomes/macEug2/genbank/macEug2.agp.gz
# qualFiles none
dbDbSpeciesDir wallaby
taxId 9315
'_EOF_'
    # << happy emacs
    # Build the database from the config file written above; all output
    # goes to makeGenomeDb.log and the job runs in the background.
    time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev \
        macEug2.config.ra > makeGenomeDb.log 2>&1 &
    # real  26m19.911s

    # Add the 2bit link now, pointing at the unmasked sequence.
    # It is replaced by a link to the masked 2bit once masking is done. ***
    ln -s "$(pwd)/macEug2.unmasked.2bit" /gbdb/macEug2/macEug2.2bit

    # Add the trackDb entries to the source tree,
    # per instructions in makeGenomeDb.log:
    mkdir -p ~/kent/src/hg/makeDb/trackDb/wallaby/macEug2
    cd /cluster/data/macEug2/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/wallaby/macEug2
    cp *.* ~/kent/src/hg/makeDb/trackDb/wallaby/macEug2
    cd ~/kent/src/hg/makeDb/trackDb
    #  edit makefile to add macEug2 to DBS.
    # git add wallaby/macEug2/*.{ra,html}
    # git commit -m "Added macEug2 to DBS." makefile
    # git commit -m "Initial descriptions for macEug2." wallaby/macEug2/*.{ra,html}
    # git pull; git push
    # Run make update DBS=macEug2 and make alpha when done.
    # (optional) Clean up /cluster/data/macEug2/TemporaryTrackDbCheckout


#########################################################################
# RepeatMasker (DONE - 2010-11-05 - Chin)
    # Work in a dedicated build directory for this track.
    mkdir /hive/data/genomes/macEug2/bed/repeatMasker
    cd /hive/data/genomes/macEug2/bed/repeatMasker

    # Run the whole pipeline at lowest priority in the background,
    # logging everything to do.log.
    time nice -n +19 doRepeatMasker.pl -buildDir="$(pwd)" \
        -workhorse=hgwdev -bigClusterHub=swarm -noSplit \
        macEug2 > do.log 2>&1 &
    #   real     294m1.203s

    # masking summary recorded in the build directory
    cat faSize.rmsk.txt
    # 3075184024 bases (539107070 N's 2536076954 real 1334921991 upper
    # 1201154963 lower) in 277711 sequences in 1 files
    # %39.06 masked total, %47.36 masked real

#########################################################################
# simpleRepeats (DONE - 2010-11-06 - Chin)
    # Work in a dedicated build directory for this track.
    mkdir /hive/data/genomes/macEug2/bed/simpleRepeat
    cd /hive/data/genomes/macEug2/bed/simpleRepeat

    # Run the pipeline at lowest priority in the background, log to do.log.
    time nice -n +19 doSimpleRepeat.pl -buildDir="$(pwd)" \
        -workhorse=hgwdev -bigClusterHub=pk -smallClusterHub=pk \
        macEug2 > do.log 2>&1 &
    #   real     175m13.951s
    cat fb.simpleRepeat
    # 38604856 bases of 2536076957 (1.522%) in intersection

    # Add the simple-repeat mask (trfMask.bed) on top of the RepeatMasker
    # masked sequence to produce the final masked macEug2.2bit.
    cd /hive/data/genomes/macEug2
    twoBitMask macEug2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed \
        macEug2.2bit
    #	safe to ignore warnings about >=13 fields

    # Record the masking statistics of the combined result.
    twoBitToFa macEug2.2bit stdout | faSize stdin > macEug2.2bit.faSize.txt
    cat macEug2.2bit.faSize.txt
    # 3075184024 bases (539107070 N's 2536076954 real 1334271294 upper
    # 1201805660 lower) in 277711 sequences in 1 files
    # %39.08 masked total, %47.39 masked real

    # double check with featureBits
    featureBits -countGaps macEug2 gap
    # 539107067 bases of 3075184024 (17.531%) in intersection

    # Re-point the /gbdb link from the unmasked 2bit to the masked one.
    rm /gbdb/macEug2/macEug2.2bit
    ln -s "$(pwd)/macEug2.2bit" /gbdb/macEug2/macEug2.2bit

#########################################################################
# Marking *all* gaps - they are all in the AGP file
#	(DONE - 2010-11-08 - Chin)
    mkdir /hive/data/genomes/macEug2/bed/allGaps
    cd /hive/data/genomes/macEug2/bed/allGaps

    # Scan the unmasked sequence with findMotif at -verbose=4; the "#GAP "
    # lines of its output are what is used below.
    time nice -n +19 findMotif -motif=gattaca -verbose=4 -strand=+ \
        ../../macEug2.unmasked.2bit > findMotif.txt 2>&1
    #   real     1m46.677s

    # Keep only the "#GAP " lines, with that prefix removed.
    sed -n -e 's/^#GAP //p' findMotif.txt > allGaps.bed

    # Everything that is not in the gap table ...
    featureBits macEug2 -not gap -bed=notGap.bed
    #  2536076957 bases of 2536076957 (100.000%) in intersection
    # ... intersected with the gaps found above gives the gaps that are
    # missing from the gap table.
    featureBits macEug2 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 3 bases of 2536076957 (0.000%) in intersection

    # largest ix value currently in the gap table
    hgsql -N -e "select ix from gap;" macEug2 | sort -n | tail -1
    #	273


########################################################################
# Create kluster run files (DONE 2010-11-04  - Chin)
    # Purpose: pick a -repMatch value for blat scaled to genome size, build
    # the 11-mer overused-tile (.ooc) file, and stage the 2bit, ooc,
    # chrom.sizes and non-bridged lift file for the cluster nodes.
    #
    # NOTE(review): the figures recorded for the numerator below
    #   (1184628269 gapless bases, 135.075% intersection) do not match the
    #   2536076957 gapless bases that featureBits reports for macEug2
    #   everywhere else in this file, and an intersection above 100% is not
    #   possible -- they look like leftovers from the template this doc was
    #   copied from.  With 2536076957 the same formula gives about 907.
    #   TODO confirm whether -repMatch=400 was really intended.  The
    #   commands are left exactly as they were run.

    # numerator is macEug2 gapless bases "real" as reported by:
    featureBits -noRandom -noHap macEug2 gap
    #     1600136831 bases of 1184628269 (135.075%) in intersection

    # denominator is hg19 gapless bases as reported by:
    #	featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 1184628269 / 2861349177 \) \* 1024
    #   ( 1184628269 / 2861349177 ) * 1024 = 423.946632
    # ==> use -repMatch=400 according to size scaled down from 1024 for human.
    #	and rounded down to nearest 50
    cd /hive/data/genomes/macEug2
    # No alignment is requested (both query and output are /dev/null);
    # the run only writes the overused 11-mer list named by -makeOoc.
    blat macEug2.2bit \
	 /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/macEug2.11.ooc \
	-repMatch=400 &
    #	Wrote 19704 overused 11-mers to jkStuff/macEug2.11.ooc
    # Stage the files that the cluster jobs read.
    mkdir /hive/data/staging/data/macEug2
    cp -p macEug2.2bit jkStuff/macEug2.11.ooc /hive/data/staging/data/macEug2
    cp -p chrom.sizes /hive/data/staging/data/macEug2
    #	check non-bridged gaps to see what the typical size is:
    hgsql -N \
	-e 'select * from gap where bridge="no" order by size;' macEug2 \
	| sort -k7,7nr
    #   most gaps have size > 100,000
    #	decide on a minimum gap for this break
    gapToLift -verbose=2 -minGap=20000 macEug2 jkStuff/nonBridged.lft \
	-bedFile=jkStuff/nonBridged.bed
    cp -p jkStuff/nonBridged.lft \
	/hive/data/staging/data/macEug2/macEug2.nonBridged.lft
    # ask cluster-admin to copy (every time any file has changed) the
    #    /hive/data/staging/data/macEug2 directory to cluster nodes
    #    /scratch/data/macEug2

########################################################################
# GENBANK AUTO UPDATE (DONE - 2011-10-20 - Chin)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull

    # edit etc/genbank.conf to add macEug2 just before susScr2

# macEug2 (wallaby)
macEug2.serverGenome = /hive/data/genomes/macEug2/macEug2.2bit
macEug2.clusterGenome = /scratch/data/macEug2/macEug2.2bit
macEug2.ooc = /scratch/data/macEug2/macEug2.11.ooc
macEug2.lift = no
macEug2.perChromTables = no
macEug2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
macEug2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
macEug2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
macEug2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
macEug2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
macEug2.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
macEug2.downloadDir = macEug2
macEug2.refseq.mrna.native.load  = no
macEug2.refseq.mrna.xeno.load = yes
macEug2.refseq.mrna.xeno.loadDesc  = yes
macEug2.genbank.est.native.load = yes
macEug2.genbank.est.native.loadDesc = yes
macEug2.genbank.mrna.native.load = yes
macEug2.genbank.mrna.native.loadDesc = yes
macEug2.genbank.mrna.xeno.load = yes
macEug2.genbank.mrna.xeno.loadDesc = yes
macEug2.genbank.est.native.load = yes
macEug2.genbank.est.native.loadDesc = yes

    git add etc/genbank.conf
    git commit -m "Added macEug2" etc/genbank.conf
    git pull
    git push
    # update /cluster/data/genbank/:
    make etc-update


# Edit src/lib/gbGenome.c to add the new species, with these two lines:
# static char *macEug2Names[] = {"Macropus eugenii", NULL};
#   ... later ...
#    {"macEug2", macEug2Names},
#  gbGenome.c is in
#  /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib
# make and check in

    make install-server
    git add src/lib/gbGenome.c
    git commit -m "adding macEug2 Wallby" src/lib/gbGenome.c
    git pull
    git push

    # Initial genbank alignment for macEug2.  The host name was recorded as
    # "hgwsev", which is not a host used anywhere else in this doc; hgwdev
    # is the host every other step logs in to.
    ssh hgwdev
    screen	#  control this business with a screen since it takes a while
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial macEug2 &
    #   logFile: var/build/logs/2011.10.23-21:57:16.macEug2.initalign.log
    #   real    4810m43.818s
    #     /cluster/data/genbank/data/aligned/genbank.176.0/macEug2

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad macEug2 &
    #   logFile: var/dbload/hgwdev/logs/2011.10.27-08:35:00.dbload.log
    #   real    66m57.028s


    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # Manual step: edit these two files and add macEug2 to each.  They are
    # listed as comments so they are not mistaken for commands to run.
    #     etc/align.dbs
    #     etc/hgwdev.dbs
    git add etc/align.dbs etc/hgwdev.dbs
    git commit  -m "Added macEug2 - Wallaby" etc/align.dbs etc/hgwdev.dbs
    git push
    # install the updated etc/ files
    make etc-update

    doGenbankTests macEug2 genbankTest.out
    # no error

##########################################################################
## WINDOWMASKER (DONE - 2011-11-30 - Chin)
    # Work in a dedicated build directory for this track.
    mkdir /hive/data/genomes/macEug2/bed/windowMasker
    cd /hive/data/genomes/macEug2/bed/windowMasker
    # Run the pipeline at lowest priority in the background, log to do.log.
    time nice -n +19 doWindowMasker.pl -buildDir="$(pwd)" \
        -workhorse=hgwdev -dbHost=hgwdev \
        macEug2 > do.log 2>&1 &
    #   real    262m19.608s

    # Masking statistics
    twoBitToFa macEug2.wmsk.2bit stdout | faSize stdin
    # 3075184024 bases (539107070 N's 2536076954 real 1634442088 upper
    # 901634866 lower) in 277711 sequences in 1 files
    # %29.32 masked total, %35.55 masked real

    # First load of the raw mask; it still overlaps gaps.
    hgLoadBed macEug2 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 18037550 elements of size 3

    featureBits -countGaps macEug2 windowmaskerSdust
    # 1457250153 bases of 3075184024 (47.387%) in intersection

    #   eliminate the gaps from the masking:
    #   intersect the loaded track with everything that is not a gap
    featureBits macEug2 -not gap -bed=notGap.bed
    #   2536076957 bases of 2536076957 (100.000%) in intersection
    time nice -n +19 featureBits macEug2 windowmaskerSdust notGap.bed \
        -bed=stdout \
        | gzip -c > cleanWMask.bed.gz
    #   918147928 bases of 2536076957 (36.203%) in intersection
    #   real    793m0.546s

    #   reload track to get it clean
    hgLoadBed macEug2 windowmaskerSdust cleanWMask.bed.gz
    #   Loaded 17901097 elements of size 4
    featureBits -countGaps macEug2 windowmaskerSdust
    #   918147928 bases of 3075184024 (29.857%) in intersection

    #   mask the unmasked sequence with this clean mask; the bed is read
    #   from stdin, so its type is given explicitly
    zcat cleanWMask.bed.gz \
        | twoBitMask ../../macEug2.unmasked.2bit stdin -type=.bed \
            macEug2.cleanWMSdust.2bit

    #   record the statistics of the cleanly masked sequence
    twoBitToFa macEug2.cleanWMSdust.2bit stdout \
        | faSize stdin > macEug2.cleanWMSdust.faSize.txt
    cat macEug2.cleanWMSdust.faSize.txt
    #   3075184024 bases (539107070 N's 2536076954 real 1617929026 upper
    #   918147928 lower) in 277711 sequences in 1 files
    #   %29.86 masked total, %36.20 masked real

############################################################################
# reset position to RHO location as found from blat of hg19 RHO gene
    hgsql -e \
'update dbDb set defaultPos="GL095587:25,998-40,635" where name="macEug2";' \
        hgcentraltest
    # set default Db to macEug2
     hgsql -e "update defaultDb set name='macEug2' where genome = 'Wallaby';" \
         hgcentraltest

##########################################################################
#  BLATSERVERS ENTRY (DONE 2011-12-07  - Chin)
# After getting a blat server assigned by the Blat Server Gods,
# Assembly macEug2 has been started on host blat2 on port 17784 (trans) and
# port 17785 (untrans).

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("macEug2", "blat2", "17784", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("macEug2", "blat2", "17785", "0", "1");' \
            hgcentraltest
    #   test it with some sequence

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2011-12-07 - Chin)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=macEug2 -all all.joiner
    # Error: macEug2.tableList tables not in .joiner file ignored

    mkdir /hive/data/genomes/macEug2/goldenPath
    cd /hive/data/genomes/macEug2/goldenPath
    makeDownloads.pl macEug2 > do.log 2>&1
    # edit all htmls and READMEs per do.log instruction

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/macEug2/pushQ
    cd /hive/data/genomes/macEug2/pushQ
    makePushQSql.pl macEug2 > macEug2.pushQ.sql 2> stderr.out
    #   check for errors in stderr.out, some are OK, e.g.:
# WARNING: macEug2 does not have seq
# WARNING: macEug2 does not have extFile

# WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
# supporting and genbank tables) which tracks to assign these tables to:
#  bosTau4ChainPileUp

    #   copy it to hgwbeta
    scp -p macEug2.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < macEug2.pushQ.sql
    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
# genscan - (working - 2013-03-06 - Chin)
    mkdir /hive/data/genomes/macEug2/bed/genscan
    cd /hive/data/genomes/macEug2/bed/genscan
    screen -S macEug2-genscan
    # First attempt: run the whole pipeline, logging to do.log.
    time -p doGenscan.pl macEug2  > do.log 2>&1
    # real 322126.50
    # Output recorded from this first run, which ended in "Command failed"
    # (kept as comments so it cannot be mistaken for commands):
# Checking job status 5277 minutes after launch
# 277711 jobs in batch
# 2 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# Can't open /hive/data/genomes/macEug2/bed/genscan/batch.1656.swarm.soe.ucsc.edu.tmp to write: Too many open files
# Command failed:
# ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' swarm nice /hive/data/genomes/macEug2/bed/genscan/doGenscan.csh




# XXXX 2013-03-11 9:55 wait for panPan1 to pass first!
    # Second attempt: resume at the genscan step.  Written as
    # -continue=genscan to match the -continue=makeBed form used below.
    time -p doGenscan.pl -continue=genscan macEug2  > genscan.log 2>&1

    #   real    31m32.788s
# Completed: 27239 of 27239 jobs
# CPU time in finished jobs:         28s       0.47m     0.01h    0.00d  0.000 y
# IO & Wait Time:                 96830s    1613.83m    26.90h    1.12d  0.003 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:              16s       0.27m     0.00h    0.00d
# Submission to last job:           154s       2.57m     0.04h    0.00d
# Estimated complete:                 0s       0.00m     0.00h    0.00d



    # NOTE(review): the denominator 2756609047 recorded in the next two
    #   results differs from the 2536076957 that featureBits reports for
    #   macEug2 elsewhere in this file; the figures recorded after the
    #   2013-09-12 makeBed step below use 2536076957.  Presumably these
    #   first two are stale -- TODO confirm.
    cat fb.macEug2.genscan.txt
    #   49283434 bases of 2756609047 (1.788%) in intersection
    cat fb.macEug2.genscanSubopt.txt
    #   50711544 bases of 2756609047 (1.840%) in intersection

    # 2013-09-12 - Hiram - one small contig was holding up
    # the show: ABQO011198437
    # it is too small and causes genscan to run forever, finish off
    # the procedure manually
    cd /hive/data/genomes/macEug2/bed/genscan
    time -p doGenscan.pl -continue=makeBed macEug2 > makeBed.log 2>&1

    cat fb.macEug2.genscan.txt
    # 38107713 bases of 2536076957 (1.503%) in intersection
    cat fb.macEug2.genscanSubopt.txt
    # 34146452 bases of 2536076957 (1.346%) in intersection

#########################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/macEug2/bed/ucscToINSDC
    cd /hive/data/genomes/macEug2/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/13/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab macEug2 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords macEug2 ucscToINSDC
    featureBits -countGaps macEug2 ucscToINSDC
    # 3075184024 bases of 3075184024 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
