# for emacs: -*- mode: sh; -*-
#
#	the above keeps emacs happy while working with this text document

# This file describes how we made the browser database on the
#	Melopsittacus undulatus - budgerigar
#	WashU St. Louis - http://genome.wustl.edu/

#	http://www.ncbi.nlm.nih.gov/bioproject/72527
#	http://www.ncbi.nlm.nih.gov/genome/10765
#	http://www.ncbi.nlm.nih.gov/genome/assembly/325078/
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AGAI01
#	Genome Coverage : 16x 454; 7x Illumina

#	http://www.ncbi.nlm.nih.gov/bioproject/19105 - chrMt
#	chrMt NC_009134.1 - EF450826.1

#	DATE:	13-Sep-2011
#	ORGANISM:	Melopsittacus undulatus
#	TAXID:	13146
#	ASSEMBLY LONG NAME:	Melopsittacus undulatus Budgerigar Version 6.3
#	ASSEMBLY SHORT NAME:	Melopsittacus_undulatus_6.3
#	ASSEMBLY SUBMITTER:	Washington University School of Medicine
#	ASSEMBLY TYPE:	Haploid
#	NUMBER OF ASSEMBLY-UNITS:	1
#	ASSEMBLY ACCESSION:	GCA_000238935.1
#	FTP-RELEASE DATE: 02-Jan-2012

#	Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Coelomata;
#	Deuterostomia; Chordata; Craniata; Vertebrata; Gnathostomata;
#	Teleostomi; Euteleostomi; Sarcopterygii; Tetrapoda; Amniota;
#	Sauropsida; Sauria; Archosauria; Dinosauria; Saurischia; Theropoda;
#	Coelurosauria; Aves; Neognathae; Psittaciformes; Psittacidae;
#	Melopsittacus


#########################################################################
## Download sequence (DONE - 2012-03-26 - Hiram)
    mkdir /hive/data/genomes/melUnd1
    mkdir /hive/data/genomes/melUnd1/genbank
    cd /hive/data/genomes/melUnd1/genbank

    time rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Melopsittacus_undulatus/Melopsittacus_undulatus_6.3/ ./
    #	real    4m16.584s

    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
# 1117355426 bases (30758804 N's 1086596622 real 1086596622 upper 0 lower)
#	in 25211 sequences in 1 files
# Total size: mean 44320.2 sd 774121.2 min 228 (gi|366922013|gb|JH533144.1|)
#	max 39887635 (gi|366898540|gb|JH556617.1|) median 1201

    faCount Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
	> faCount.txt
     egrep -v "^#seq|^total" faCount.txt \
	| awk '{printf "%s\t%d\n", $1, $2}' \
	| sed -e 's/^gi.*gb|//; s/|//' | sort -k2,2nr > chrom.sizes

    n50.pl chrom.sizes
#       reading: chrom.sizes
#       contig count: 25211, total size: 1117355426, one half size: 558677713
# cumulative    N50 count       contig  contig size
549534897       27      JH556592.1      11405270
558677713 one half size
560149280       28      JH556573.1      10614383

#########################################################################
# process into UCSC naming scheme (DONE - 2012-03-27 - Hiram)
    mkdir /hive/data/genomes/melUnd1/ucsc
    cd /hive/data/genomes/melUnd1/ucsc

    # verify we don't have any .acc numbers different from .1
    zcat \
    ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| cut -f1 | egrep "^JH|AGAI" \
	| sed -e 's/^JH[0-9][0-9]*//; s/^AGAI[0-9][0-9]*//' | sort | uniq -c
    #	116513 .1

    # this is like the unplaced.pl script in other assemblies except it
    #	does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

# Translate the GenBank unplaced scaffold AGP and FASTA files into UCSC
# naming: the ".1" version suffix is removed from the sequence names and
# the FASTA headers are reduced to the bare accession.  No chrUn_ prefix
# is added.  Results are written to unplaced.agp.gz and unplaced.fa.gz in
# the current directory.

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# AGP: comment lines pass through untouched; on data lines only the object
# name in column 1 loses its .1 suffix, component names keep their version.
open (my $agpIn, "-|", "zcat", $agpFile)
    or die "can not read $agpFile: $!";
open (my $agpOut, "|-", "gzip -c > unplaced.agp.gz")
    or die "can not write to unplaced.agp.gz: $!";
while (my $line = <$agpIn>) {
    if ($line !~ m/^#/) {
        # anchored to the first column so a ".1" elsewhere on the line
        # can never be the one that is removed
        $line =~ s/^([^\t]+)\.1\t/$1\t/;
    }
    print $agpOut $line;
}
# close on a pipe returns false when the child exits non-zero; without
# these checks a failed zcat or gzip would leave a truncated file unnoticed
close ($agpIn) or die "zcat of $agpFile failed: $! (exit status $?)";
close ($agpOut) or die "gzip to unplaced.agp.gz failed: $! (exit status $?)";

# FASTA: headers look like ">gi|NNN|gb|ACCESSION.1| description";
# keep only ACCESSION, sequence lines pass through untouched.
open (my $faIn, "-|", "zcat", $fastaFile)
    or die "can not read $fastaFile: $!";
open (my $faOut, "|-", "gzip -c > unplaced.fa.gz")
    or die "can not write to unplaced.fa.gz: $!";
while (my $line = <$faIn>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        # print, not printf: the name is data and must not be used as a
        # format string
        print $faOut ">$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat of $fastaFile failed: $! (exit status $?)";
close ($faOut) or die "gzip to unplaced.fa.gz failed: $! (exit status $?)";
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    time ./unplaced.pl
    #	real    5m23.124s
# -rw-rw-r-- 1   1483871 Mar 27 12:29 unplaced.agp.gz
# -rw-rw-r-- 1 332029817 Mar 27 12:34 unplaced.fa.gz

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa.gz
# 1117355426 bases (30758804 N's 1086596622 real 1086596622 upper 0 lower)
#	in 25211 sequences in 1 files
# Total size: mean 44320.2 sd 774121.2 min 228 (JH533144)
#	max 39887635 (JH556617) median 1201

#############################################################################
#  Initial database build (DONE - 2012-03-27 - Hiram)
    cd /hive/data/genomes/melUnd1
    cat << '_EOF_' > melUnd1.config.ra
# Config parameters for makeGenomeDb.pl:
db melUnd1
clade vertebrate
genomeCladePriority 60
scientificName Melopsittacus undulatus
commonName Budgerigar
assemblyDate Sep. 2011
assemblyLabel WUSTL Melopsittacus undulatus Budgerigar Version 6.3 (GCA_000238935.1)
assemblyShortLabel WUSTL v6.3
ncbiAssemblyName Melopsittacus_undulatus_6.3
ncbiAssemblyId 325078
orderKey 4375
mitoAcc NC_009134
fastaFiles /hive/data/genomes/melUnd1/ucsc/unplaced.fa.gz
agpFiles /hive/data/genomes/melUnd1/ucsc/unplaced.agp.gz
dbDbSpeciesDir birds
taxId   13146
'_EOF_'
    # << happy emacs

    # first verify the sequence and AGP files are OK
    time makeGenomeDb.pl -stop=agp -workhorse=hgwdev melUnd1.config.ra \
	> agp.log 2>&1
    #	real    1m24.027s
    # verify that was OK, look at the agp.log file
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev melUnd1.config.ra \
	> db.log 2>&1
    #	real    8m11.569s
    # verify that was OK, look at the db.log file
    # copy the trackDb business to the source tree, check it in and add
    #	to the trackDb/makefile

#############################################################################
# running repeat masker (DONE - 2012-03-27 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/repeatMasker
    cd /hive/data/genomes/melUnd1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek melUnd1 > do.log 2>&1 &
    #	real    120m5.509s
    cat faSize.rmsk.txt
    #	1117373619 bases (30758804 N's 1086614815 real 1006316728 upper
    #	80298087 lower) in 25212 sequences in 1 files
    #	Total size: mean 44319.1 sd 774105.8 min 228 (JH533144)
    #	max 39887635 (JH556617) median 1201
    #	%7.19 masked total, %7.39 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps melUnd1 rmsk
    #	80384935 bases of 1117373619 (7.194%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases; the faSize count
    #	separates out the N's from the bases and doesn't show lower case N's,
    #	while this featureBits -countGaps count includes the masked N's

##########################################################################
# running simple repeat (DONE - 2012-03-27 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/simpleRepeat
    cd /hive/data/genomes/melUnd1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	melUnd1 > do.log 2>&1 &
    #	real    23m16.165s

    cat fb.simpleRepeat 
    #	5906822 bases of 1086620216 (0.544%) in intersection

    # not going to add to rmsk here, using the window masker instead since
    #	it masks more sequence
    cd /hive/data/genomes/melUnd1
    twoBitMask melUnd1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed melUnd1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa melUnd1.2bit stdout | faSize stdin > faSize.melUnd1.2bit.txt
    cat faSize.melUnd1.2bit.txt
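    #	NOTE: the numbers recorded below do not match melUnd1 (1117373619
    #	bases in 25212 sequences elsewhere in this file); they appear to be
    #	left over from the makeDoc this section was copied from:
    #	2608572064 bases (131440969 N's 2477131095 real 1320629270 upper
    #	1156501825 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413)
    #	max 72162052 (JH378105) median 1553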

    rm /gbdb/melUnd1/melUnd1.2bit
    ln -s `pwd`/melUnd1.2bit /gbdb/melUnd1/melUnd1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-03-27 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/gap
    cd /hive/data/genomes/melUnd1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../melUnd1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m8.820s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed

    time featureBits -countGaps melUnd1 -not gap -bed=notGap.bed
    #	1086620216 bases of 1117373619 (97.248%) in intersection
    #	real    0m7.713s

    time featureBits -countGaps melUnd1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	5401 bases of 1117373619 (0.000%) in intersection
    #	real    6m52.681s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" melUnd1 | sort -n | tail -1
    #	2622
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

# Turn each interval in new.gaps.bed into a gap table row of type 'other',
# marked bridged, numbering the rows onward from the highest ix already in
# the melUnd1 gap table.  Rows are written to stdout.

use strict;
use warnings;

# highest ix currently present in the gap table
my $gapIx = `hgsql -N -e "select ix from gap;" melUnd1 | sort -n | tail -1`;
chomp $gapIx;

open (my $bedFh, "<", "new.gaps.bed") or die "can not read new.gaps.bed";
while (my $bedLine = <$bedFh>) {
    # only the first three bed columns are used
    my ($name, $start, $end) = split(/\s+/, $bedLine);
    my $gapSize = $end - $start;
    $gapIx++;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $name, $start,
        $end, $gapIx, $gapSize;
}
close ($bedFh);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps melUnd1 other.bed
    #	5401 bases of 1117373619 (0.000%) in intersection
    wc -l other.bed
    #	5371
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad melUnd1 otherGap other.bed
    #	Read 5371 elements of size 8 from other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" melUnd1
    #	45651
    hgsql melUnd1 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" melUnd1
    #	51022
    # == 45651 + 5371
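    # there is an odd one marked as bridged when it is actually a continuation
    #	of a non-bridged gap.
    #	Found when running gapToLift below, without the -minGap=100.
    #	chr22:42178985-42178985
    #	NOTE: melUnd1 consists of scaffolds only (no chr22), so this note and
    #	the update below appear to be carried over from another assembly's
    #	doc; the update is not expected to match any rows here.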
    hgsql -e 'update gap set bridge="no" where chromStart=42178984 AND chrom="chr22";' melUnd1

    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 melUnd1 nonBridged.lift -bedFile=nonBridged.bed
    # see example in danRer7.txt

    # no unannotated gaps, no more to run here
    # there are no non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" melUnd1 | sort | uniq -c
    #	51022 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-03-27 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/windowMasker
    cd /hive/data/genomes/melUnd1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev melUnd1 > do.log 2>&1 &
    #	about 47 minutes

    # Masking statistics
    twoBitToFa melUnd1.wmsk.2bit stdout | faSize stdin
    #	1117373619 bases (30758804 N's 1086614815 real 878580317 upper
    #	208034498 lower) in 25212 sequences in 1 files
    #	Total size: mean 44319.1 sd 774105.8 min 228 (JH533144)
    #	max 39887635 (JH556617) median 1201
    #	%18.62 masked total, %19.15 masked real

    twoBitToFa melUnd1.wmsk.sdust.2bit stdout | faSize stdin
    #	1117373619 bases (30758804 N's 1086614815 real 873701307 upper
    #	212913508 lower) in 25212 sequences in 1 files
    #	Total size: mean 44319.1 sd 774105.8 min 228 (JH533144)
    #	max 39887635 (JH556617) median 1201
    #	%19.05 masked total, %19.59 masked real

    hgLoadBed melUnd1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Read 5868579 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps melUnd1 windowmaskerSdust
    #	243668416 bases of 1117373619 (21.807%) in intersection

    #	eliminate the gaps from the masking
    featureBits melUnd1 -not gap -bed=notGap.bed
    #	1086614815 bases of 1086614815 (100.000%) in intersection
    time nice -n +19 featureBits melUnd1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	212913508 bases of 1086614815 (19.594%) in intersection
    #	real    4m52.646s

    #	reload track to get it clean
    hgLoadBed melUnd1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 5864264 elements of size 4
    time featureBits -countGaps melUnd1 windowmaskerSdust
    #	212913508 bases of 1117373619 (19.055%) in intersection
    #	real    0m43.151s

    #	mask with this clean result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../melUnd1.unmasked.2bit stdin \
	    -type=.bed melUnd1.cleanWMSdust.2bit
    twoBitToFa melUnd1.cleanWMSdust.2bit stdout | faSize stdin \
        > melUnd1.cleanWMSdust.faSize.txt
    cat melUnd1.cleanWMSdust.faSize.txt
    #	1117373619 bases (30758804 N's 1086614815 real 873701307 upper
    #	212913508 lower) in 25212 sequences in 1 files
    #	Total size: mean 44319.1 sd 774105.8 min 228 (JH533144)
    #	max 39887635 (JH556617) median 1201
    #	%19.05 masked total, %19.59 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps melUnd1 rmsk windowmaskerSdust
    #	62533481 bases of 1117373619 (5.596%) in intersection
    #	real    0m53.765s

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-03-29 - Hiram)
    cd /hive/data/genomes/melUnd1
    twoBitMask -add bed/windowMasker/melUnd1.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed melUnd1.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    time twoBitToFa melUnd1.2bit stdout | faSize stdin > faSize.melUnd1.txt
    #	real    0m20.373s
    cat faSize.melUnd1.txt
    #	1117373619 bases (30758804 N's 1086614815 real 873453109 upper
    #	213161706 lower) in 25212 sequences in 1 files
    #	Total size: mean 44319.1 sd 774105.8 min 228 (JH533144)
    #	max 39887635 (JH556617) median 1201
    #	%19.08 masked total, %19.62 masked real

    #	create symlink to gbdb
    ssh hgwdev
    cd /hive/data/genomes/melUnd1
    rm /gbdb/melUnd1/melUnd1.2bit
    ln -s `pwd`/melUnd1.2bit /gbdb/melUnd1/melUnd1.2bit

#########################################################################
# cpgIslands - (DONE - 2012-04-24 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/cpgIslands
    cd /hive/data/genomes/melUnd1/bed/cpgIslands
    time doCpgIslands.pl melUnd1 > do.log 2>&1
    #   real    58m33.619s

    cat fb.melUnd1.cpgIslandExt.txt
    #   8424594 bases of 1086614815 (0.775%) in intersection

#########################################################################
# genscan - (DONE - 2012-04-26 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/genscan
    cd /hive/data/genomes/melUnd1/bed/genscan
    time doGenscan.pl melUnd1 > do.log 2>&1
    # one job failed
./runGsBig.csh JH556617 006 gtf/006/JH556617.gtf pep/006/JH556617.pep subopt/006/JH556617.bed
    # re-running with a window size of 2000000 OK
    # continuing:
    time doGenscan.pl -continue=makeBed melUnd1 > makeBed.log 2>&1
    #   real    56m49.050s

    cat fb.melUnd1.genscan.txt
    #   21738187 bases of 1086614815 (2.001%) in intersection
    cat fb.melUnd1.genscanSubopt.txt
    #   26863973 bases of 1086614815 (2.472%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=400, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1086614815 / 2897316137 \) \* 1024
    #	( 1086614815 / 2897316137 ) * 1024 = 384.042858

    # round up to 400

    cd /hive/data/genomes/melUnd1
    time blat melUnd1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/melUnd1.11.ooc -repMatch=400
    #   Wrote 14287 overused 11-mers to jkStuff/melUnd1.11.ooc
    #   real    0m24.153s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" melUnd1 | sort | uniq -c
    #   51022 yes
#    cd /hive/data/genomes/melUnd1/jkStuff
#    gapToLift melUnd1 melUnd1.nonBridged.lift -bedFile=melUnd1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' melUnd1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Melopsittacus undulatus	25	1	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add melUnd1 just after ce2
# melUnd1 (Budgerigar)
melUnd1.serverGenome = /hive/data/genomes/melUnd1/melUnd1.2bit
melUnd1.clusterGenome = /hive/data/genomes/melUnd1/melUnd1.2bit
melUnd1.ooc = /hive/data/genomes/melUnd1/jkStuff/melUnd1.11.ooc
melUnd1.lift = no
melUnd1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
melUnd1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
melUnd1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
melUnd1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
melUnd1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
melUnd1.refseq.mrna.native.load = no
melUnd1.refseq.mrna.xeno.load = yes
melUnd1.genbank.mrna.xeno.load = yes
melUnd1.genbank.est.native.load = no
melUnd1.downloadDir = melUnd1
melUnd1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding melUnd1 budgerigar" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for melUndNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S melUnd1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial melUnd1 &
    # after setting genbank.mrna.xeno.load = no
    #   var/build/logs/2012.06.07-16:08:32.melUnd1.initalign.log
    #   real    883m56.428s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad melUnd1 &
    #   real    10m14.118s
    #   var/dbload/hgwdev/logs/2012.06.10-12:54:07.dbload.log

    # enable daily alignment and update of hgwdev (DONE - 2012-05-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add melUnd1 to:
        etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added melUnd1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-23 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH556451:700038-706370" where name="melUnd1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-23 - Hiram)
    mkdir /hive/data/genomes/melUnd1/pushQ
    cd /hive/data/genomes/melUnd1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl melUnd1 2> stderr.txt | grep -v transMap > melUnd1.sql
    #   real    4m1.757s

    scp -p melUnd1.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < melUnd1.sql

############################################################################
# LASTZ Medium Ground Finch geoFor1 (DONE - 2012-07-28 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S melUnd1
    mkdir /hive/data/genomes/melUnd1/bed/lastzGeoFor1.2012-07-28
    cd /hive/data/genomes/melUnd1/bed/lastzGeoFor1.2012-07-28

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    cat << '_EOF_' > DEF
# Budgerigar vs. Medium Ground Finch
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz

# TARGET: Budgerigar melUnd1
SEQ1_DIR=/hive/data/genomes/melUnd1/melUnd1.2bit
SEQ1_LEN=/hive/data/genomes/melUnd1/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Medium Ground Finch geoFor1
SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit
SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/melUnd1/bed/lastzGeoFor1.2012-07-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > do.log 2>&1
    #   real    1628m48.053s
    cat fb.melUnd1.chainGeoFor1Link.txt
    #   832321696 bases of 1086614815 (76.598%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/melUnd1/bed
    ln -s lastzGeoFor1.2012-07-28 lastz.geoFor1

    mkdir /hive/data/genomes/geoFor1/bed/blastz.melUnd1.swap
    cd /hive/data/genomes/geoFor1/bed/blastz.melUnd1.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/melUnd1/bed/lastzGeoFor1.2012-07-28/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > swap.log 2>&1 &
    #   real    56m32.909s

    cat fb.geoFor1.chainMelUnd1Link.txt
    #   839798431 bases of 1041286029 (80.650%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/geoFor1/bed
    ln -s blastz.melUnd1.swap lastz.melUnd1

##############################################################################
# lastz Mouse mm10 (DONE - 2012-04-02 - Hiram)
    # original alignment
    cd /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29
    cat fb.mm10.chainMelUnd1Link.txt
    #	95217653 bases of 2652783500 (3.589%) in intersection

    #	and for the swap
    mkdir /hive/data/genomes/melUnd1/bed/blastz.mm10.swap
    cd /hive/data/genomes/melUnd1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    #	real    9m9.260s
    cat  fb.melUnd1.chainMm10Link.txt
    #	79867911 bases of 1086614815 (7.350%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/melUnd1/bed
    ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/melUnd1/bed/ucscToINSDC
    cd /hive/data/genomes/melUnd1/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_009134.1
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/8/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
      | hgLoadSqlTab melUnd1 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords melUnd1 ucscToINSDC
    featureBits -countGaps melUnd1 ucscToINSDC
    # 1117373619 bases of 1117373619 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
