# for emacs: -*- mode: sh; -*-

#	 Papio hamadryas - Baboon
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/bin0/*
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/linearScaffolds/*
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/contigs/*

#  http://www.hgsc.bcm.tmc.edu/project-species-p-Papio%20hamadryas.hgsc?pageLocation=Papio%20hamadryas

##########################################################################
# download sequence (DONE - 2009-05-06 - Hiram)
    mkdir -p /hive/data/genomes/papHam1/download
    cd /hive/data/genomes/papHam1/download

wget --timestamping \
'ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/contigs/*'
wget --timestamping \
'ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/linearScaffolds/*'
wget --timestamping \
'ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/bin0/*'

    cat << '_EOF_' > renameScaffold.sh
#!/bin/sh

scaf=1
export scaf
# zcat Pham.20081120.linear.fa.gz | faCount stdin > faCount.linear.txt
egrep -v "^#seq|^total" faCount.linear.txt \
        | awk '{printf "%s\t%d\n", $1,$2}' \
        | sed -e "s/^Contig//" \
        | sort -k2,2nr -k1,1n | sed -e "s/^/Contig/" | awk '{print $1}'
\
| while read C
do
    echo "${C} scaffold${scaf}"
    scaf=`echo $scaf | awk '{print $1+1}'`
done
'_EOF_'
    # << happy emacs
    chmod +x renameScaffold.sh
    ./renameScaffold.sh > ctg2scaf.txt

    cat << '_EOF_' > ucscAgp.pl
#!/usr/bin/env perl

use warnings;
use strict;

my %scafName;
my $scafCount = 0;

open (FH, "<ctg2scaf.txt") or die "can not read ctg2scaf.txt";
while (my $line = <FH>) {
    chomp $line;
    my ($ctg, $scaf) = split('\s+',$line);
    $scafName{$ctg} = $scaf;
    ++$scafCount;
}
close (FH);

printf STDERR "read in $scafCount ctg to scaffold name translations\n";

open (FH, "<Pham.20081120.contigs.agp") or
	die "can not read Pham.20081120.contigs.agp";
while (my $line = <FH>) {
    my ($ctg, $rest) = split('\s+',$line,2);
    if ($ctg =~ m/_/) {
	printf "%s\t%s", $scafName{$ctg}, $rest;
    } else {
	printf "%s\t%s", $ctg, $rest;
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ucscAgp.pl
    ./ucscAgp.pl > ucsc.agp

    cat << '_EOF_' > rename.pl
#!/usr/bin/env perl

use warnings;
use strict;

my %scafName;
my $scafCount = 0;

open (FH, "<ctg2scaf.txt") or die "can not read ctg2scaf.txt";
while (my $line = <FH>) {
    chomp $line;
    my ($ctg, $scaf) = split('\s+',$line);
    $scafName{$ctg} = $scaf;
    ++$scafCount;
}
close (FH);

printf STDERR "read in $scafCount ctg to scaffold name translations\n";

open (QL,">linear.qual.fa") or die "can not write to linear.qual.fa";
open (FH, "zcat Pham.20081120.linear.fa.qual.gz|") or
	die "can not zcat Pham.20081120.linear.fa.qual.gz|";
while (my $line = <FH>) {
    if ($line =~m/^>/) {
	chomp $line;
	my $ctg = $line;
	$ctg =~ s/^>//;
	if ($ctg =~ m/_/) {
	    printf QL ">%s\n", $scafName{$ctg};
	} else {
	    printf QL ">%s\n", $ctg;
	}
    } else {
	printf QL "%s", $line;
    }
}
close (FH);
close (QL);
'_EOF_'
    # << happy emacs
    chmod +x rename.pl
    ./rename.pl

##########################################################################
# initial browser construction (DONE - 2009-05-12 - Hiram)
    cd /hive/data/genomes/papHam1

    cat << '_EOF_' > papHam1.config.ra
# Config parameters for makeGenomeDb.pl:
db papHam1
scientificName Papio hamadryas
commonName Baboon
assemblyDate Nov. 2008
assemblyLabel Baylor BCM HGSC Pham_1.0
orderKey 39
mitoAcc NC_001992
fastaFiles /hive/data/genomes/papHam1/download/Pham.20081120.contigs.fa.gz
agpFiles /hive/data/genomes/papHam1/download/ucsc.agp
qualFiles /hive/data/genomes/papHam1/download/linear.qual.qac
dbDbSpeciesDir baboon
clade mammal
genomeCladePriority 16
taxId   9562
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl -fileServer=hgwdev \
	-workhorse=hgwdev papHam1.config.ra > makeGenomeDb.out 2>&1

##########################################################################
# repeatMasking (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/papHam1/bed/repeatMasker
    cd /hive/data/genomes/papHam1/bed/repeatMasker

    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=memk papHam1 > do.log 2>&1 &
    # about 15 hours

    cat faSize.rmsk.txt 
    # 2867564654 bases (125715603 N's 2741849051 real 1407767294 upper
    #	1334081757 lower) in 387374 sequences in 1 files
    #	%46.52 masked total, %48.66 masked real

    grep -i versi do.log
    #	RepeatMasker version development-$Id: RepeatMasker,v 1.23 2009/02/02 21:10:05 angie Exp $
    #    Jan 29 2009 (open-3-2-7) version of RepeatMasker

##########################################################################
# running simple repeat (DONE - 2009-05-14 - Hiram)
    mkdir /hive/data/genomes/papHam1/bed/simpleRepeat
    cd /hive/data/genomes/papHam1/bed/simpleRepeat

    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	papHam1 > do.log 2>&1 &
    # about 38 hours

    featureBits papHam1 simpleRepeat
    #	64502452 bases of 2741867288 (2.353%) in intersection

    twoBitMask papHam1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed papHam1.2bit
    twoBitToFa papHam1.2bit stdout | faSize stdin > papHam1.2bit.faSize.txt
    #	2867564654 bases (125715603 N's 2741849051 real 1406524634 upper
    #	1335324417 lower) in 387374 sequences in 1 files
    #	%46.57 masked total, %48.70 masked real

    twoBitToFa papHam1.rmsk.2bit stdout | faSize stdin > papHam1.rmsk.faSize.txt
    #	2867564654 bases (125715603 N's 2741849051 real 1407767294 upper
    #	1334081757 lower) in 387374 sequences in 1 files
    #	%46.52 masked total, %48.66 masked real

    rm /gbdb/papHam1/papHam1.2bit
    ln -s /hive/data/genomes/papHam1/papHam1.2bit /gbdb/papHam1/papHam1.2bit

##########################################################################
# SWAP lastz from Mm10 (DONE - 2012-03-10 - Hiram)
    # original alignment on Mm10
    cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09
    cat fb.mm10.chainPapHam1Link.txt 
    #	890718423 bases of 2652783500 (33.577%) in intersection

    #	and for this swap
    mkdir /hive/data/genomes/papHam1/bed/blastz.mm10.swap
    cd /hive/data/genomes/papHam1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    548m15.438s
    cat fb.mm10.chainPapHam1Link.txt 
    #	878016290 bases of 2741867288 (32.023%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/papHam1/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# cpgIsland - (DONE - 2012-04-23 - Hiram)
    mkdir /hive/data/genomes/papHam1/bed/cpgIslands
    cd /hive/data/genomes/papHam1/bed/cpgIslands
    time doCpgIslands.pl papHam1 > do.log 2>&1
    #   real    940m44.062s
    # fixing broken command in script:
    time ./doLoadCpg.csh
    #   real    3m56.264s
    time doCpgIslands.pl -continue=cleanup papHam1 > cleanup.log 2>&1
    #   real    121m47.995s

    cat fb.papHam1.cpgIslandExt.txt
    #   19960131 bases of 2741867288 (0.728%) in intersection

##############################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/papHam1/bed/genscan
    cd /hive/data/genomes/papHam1/bed/genscan
    time doGenscan.pl papHam1 > do.log 2>&1
    #   real    810m7.177s

    cat fb.papHam1.genscan.txt
    #   51391265 bases of 2741867288 (1.874%) in intersection
    cat fb.papHam1.genscanSubopt.txt
    #   52066592 bases of 2741867288 (1.899%) in intersection

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-05-02 - Hiram)
    mkdir /hive/data/genomes/papHam1/bed/gap
    cd /hive/data/genomes/papHam1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../papHam1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m15.269s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits papHam1 -not gap -bed=notGap.bed
    #	1358346944 bases of 1358346944 (100.000%) in intersection
    featureBits papHam1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	18237 bases of 2741867288 (0.001%) in intersection

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" papHam1 | sort -n | tail -1
    #	270
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" papHam1 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps papHam1 other.bed
    #	18237 bases of 2741867288 (0.001%) in intersection
    wc -l other.bed
    #	5743
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad papHam1 otherGap other.bed
    #   Read 5743 elements of size 8 from other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" papHam1
    #	366334
    hgsql papHam1 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" papHam1
    #	372077
    # == 366334 + 5743
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 papHam1 nonBridged.lift -bedFile=nonBridged.bed
    # see example in danRer7.txt
    # there are no non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" papHam1 | sort | uniq -c
    #	372077 yes

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S papHam1
    mkdir /hive/data/genomes/papHam1/bed/windowMasker
    cd /hive/data/genomes/papHam1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev papHam1 > do.log 2>&1
    #   real    1037m3.817s
    # fixing broken commands in doLoad step:
    time ./lastLoad.csh
    #   real    17m15.223s
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \
        -dbHost=hgwdev papHam1 > cleanup.log 2>&1
    #   real    2m19.164s

    sed -e 's/^/    #\t/' fb.papHam1.windowmaskerSdust.beforeClean.txt
    #   1107561332 bases of 2867564654 (38.624%) in intersection
    sed -e 's/^/    #\t/' fb.papHam1.windowmaskerSdust.clean.txt
    #   981847549 bases of 2867564654 (34.240%) in intersection
    sed -e 's/^/    #\t/' fb.papHam1.rmsk.windowmaskerSdust.txt
    #   743599327 bases of 2867564654 (25.931%) in intersection

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold4004:96950-104215" where name="papHam1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/papHam1/pushQ
    cd /hive/data/genomes/papHam1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl papHam1 2> stderr.txt | grep -v transMap > papHam1.sql
    #   real    3m56.635s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/papHam1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/papHam1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/papHam1/bbi/quality.bw
# WARNING: papHam1 does not have seq
# WARNING: papHam1 does not have extFile
# WARNING: papHam1 does not have estOrientInfo

    scp -p papHam1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/papHam1.sql"

#############################################################################
# LIFTOVER TO papAnu2 (DONE - 2013-01-16 - Chin)
    sh hgwdev
    cd /hive/data/genomes/papHam1
    ln -s jkStuff/papHam1.11.ooc 11.ooc
    screen
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         papHam1 papAnu2 > doLiftOverToPapHam1.log 2>&1 &
    # real    1062m56.689s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/papHam1/liftOver/
    md5sum papAHam1ToPapAnu2.over.chain.gz >> md5sum.txt
    popd

############################################################################
