# for emacs: -*- mode: sh; -*-

 
# This file describes building the browser database for the archaeal
# species Pyrobaculum aerophilum.

# DOWNLOAD SEQUENCE FROM GENBANK (DONE 01/07/04)

    ssh eieio
    mkdir /cluster/store5/archae/pyrAer1
    ln -s /cluster/store5/archae/pyrAer1 /cluster/data/pyrAer1
    cd /cluster/data/pyrAer1
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Pyrobaculum_aerophilum/NC_003364.fna
    mv NC_003364.fna chr1.fa
    # Edit header of chr1.fa to '> pyrAer1'

# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE 01/12/04)

    ssh hgwdev
    echo 'create database pyrAer1' | hgsql ''
    cd /cluster/data/pyrAer1
    hgNibSeq pyrAer1 /cluster/data/pyrAer1/nib chr1.fa
    faSize -detailed chr1.fa > chrom.sizes
    mkdir -p /gbdb/pyrAer1/nib
    # Link the nib into /gbdb, which is the nibPath registered in dbDb below.
    # (The chr1 -> chr rename section later removes /gbdb/pyrAer1/nib/chr1.nib,
    # so this link has to exist.)
    ln -s /cluster/data/pyrAer1/nib/chr1.nib /gbdb/pyrAer1/nib
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
	    | hgsql pyrAer1
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk) values \
        ("pyrAer1", "Pyrobaculum aerophilum", "/gbdb/pyrAer1/nib", "P. aerophilum", \
               "chr1:500000-550000", 1, 85, "Archae", \
                "Pyrobaculum aerophilum", "/gbdb/pyrAer1/html/description.html", \
                0);' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'INSERT INTO defaultDb (genome, name) values ("Archae", "pyrAer1");' \
      | hgsql -h genome-testdb hgcentraltest

    cd ~/kent/src/hg/makeDb/trackDb
    # edit the trackDb makefile

    # add the trackDb directories
    mkdir -p archae/pyrAer1
    cvs add archae
    cvs add archae/pyrAer1
    cvs commit

# GC PERCENT TRACK (DONE 01/12/04)

    ssh hgwdev
    mkdir -p /cluster/data/pyrAer1/bed/gcPercent
    cd /cluster/data/pyrAer1/bed/gcPercent
    hgsql pyrAer1 < ~/kent/src/hg/lib/gcPercent.sql
    hgGcPercent -win=2000 pyrAer1 ../../nib
    # edit ~/kent/src/hg/makeDb/trackDb/archae/trackDb.ra and add a GC percent
    # entry that uses 2,000 base windows (cut and paste the GC percent entry
    # from ~/kent/src/hg/makeDb/trackDb/trackDb.ra and adjust it).

# GC 20 BASE WIGGLE TRACK (DONE 8/20)

    mkdir /cluster/data/pyrAer1/bed/gc20Base
    cd /cluster/data/pyrAer1/bed/gc20Base
    mkdir wigData20 dataLimits20
    hgGcPercent -chr=chr -file=stdout -win=20 -overlap=19 pyrAer1 ../../nib | grep -w GC | \
    awk '
{
    bases = $3 - $2
    perCent = $5/10.0
    printf "%d\t%.1f\n", $2+1, perCent
}' | wigAsciiToBinary -dataSpan=1 -chrom=chr \
	-wibFile=wigData20/gc20Base_1 -name=1 stdin > dataLimits20/chr
    hgLoadWiggle pyrAer1 gc20Base wigData20/*.wig
    mkdir /gbdb/pyrAer1/wib
    ln -s `pwd`/wigData20/*.wib /gbdb/pyrAer1/wib
    #	the trackDb entry
track gc20Base
shortLabel GC Percent
longLabel GC Percent in 20 base windows
group map
priority 1.7
visibility hide
autoScale Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
windowingFunction Mean
color 0,128,255
altColor 255,128,0
viewLimits 30:70
type wig 0 100    

# CONTIG TRACK 
    # reformat is a Todd Lowe program
    # Fetch GenBank records AE009746.1 .. AE009946.1 and append each one, as
    # FASTA, to contigs.fa.
    for num in $(seq 746 946); do
       # -f: fail on an HTTP error instead of saving the error page as a .gbk,
       # so a bad download is reported rather than passed on to reformat.
       if curl -f -o "${num}.gbk" "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?txt=on&val=AE009${num}.1"; then
          reformat fasta "${num}.gbk" >> contigs.fa
       else
          echo "download failed for AE009${num}.1" >&2
       fi
    done
    blat /cluster/data/pyrAer1/chr1.fa contigs.fa -minIdentity=100 contigs.psl
    perfectBlatBed4 contigs.psl contigs.bed
    mkdir /cluster/data/pyrAer1/bed/pyrAer1Contigs
    cp contigs.bed /cluster/data/pyrAer1/bed/pyrAer1Contigs
    cd /cluster/data/pyrAer1/bed/pyrAer1Contigs
    hgLoadBed pyrAer1 pyrAer1Contigs contigs.bed
    # the trackDb entry: 
track pyrAer1Contigs
shortLabel Contigs
longLabel Contigs deposited in Genbank
group map
priority 0.5
visibility pack
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=nucleotide&cmd=search&term=$$
type bed 4 .


# TANDEM REPEAT MASKER (NOT DONE YET)

    ssh hgwdev
    mkdir -p /cluster/data/pyrAer1/bed/simpleRepeat
    cd /cluster/data/pyrAer1
    trfBig chr1.fa /dev/null -bedAt=/cluster/data/pyrAer1/bed/simpleRepeat/chr1.bed
    cd /cluster/data/pyrAer1/bed/simpleRepeat
    # Load every .bed in this directory into the simpleRepeat table, using the
    # table definition named by -sqlTable.
    # NOTE(review): the other sections load their .sql files from
    # ~/kent/src/hg/lib; "~kent" below names the home directory of user "kent"
    # instead -- confirm which path was intended.
    hgLoadBed pyrAer1 simpleRepeat *.bed -sqlTable=~kent/src/hg/lib/simpleRepeat.sql
    # weird, I get a seg fault here.  need to ask Hiram or somebody else.

# DESCRIPTION PAGE (DONE 01/12/04)

    ssh hgwdev
    # Write ~/kent/src/hg/makeDb/trackDb/archae/pyrAer1/description.html
    chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/pyrAer1/description.html
    # Check it in.
    mkdir /gbdb/pyrAer1/html
    ln -s /cluster/data/pyrAer1/html/description.html /gbdb/pyrAer1/html/

# GENBANK PROTEIN-CODING GENES (DONE 01/14/04)

    ssh hgwdev
    mkdir /cluster/data/pyrAer1/genbank
    cd /cluster/data/pyrAer1/genbank
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Pyrobaculum_aerophilum/NC_003364.gbk
    mv NC_003364.gbk pyrAer1.gbk
    # Create 3 files to assist parsing of the genbank
    # 1. for a bed file
    echo 'chr1
start
end
gene
1000
strand' > pyrAer1-params-bed.txt
    # 2. for the peptide parts
    echo 'gene
translation' > pyrAer1-params-pep.txt
    # 3. for the other gene information
    echo 'gene
gene
product
note
protein_id
db_xref
EC_number
pseudo' > pyrAer1-params-xra.txt
    # Now extract the genes and information:
    gbArchaeGenome pyrAer1.gbk pyrAer1-params-bed.txt pyrAer1-genbank-cds.bed
    gbArchaeGenome pyrAer1.gbk pyrAer1-params-pep.txt pyrAer1-genbank-cds.pep
    gbArchaeGenome pyrAer1.gbk pyrAer1-params-xra.txt pyrAer1-genbank-cds.xra
    hgsql pyrAer1 < ~/kent/src/hg/lib/pepPred.sql
    hgsql pyrAer1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table pepPred to gbProtCodePep | hgsql pyrAer1
    echo rename table minGeneInfo to gbProtCodeXra | hgsql pyrAer1
    echo load data local infile \'pyrAer1-genbank-cds.pep\' into table gbProtCodePep | hgsql pyrAer1
    echo load data local infile \'pyrAer1-genbank-cds.xra\' into table gbProtCodeXra | hgsql pyrAer1
csh
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' pyrAer1-genbank-cds.bed | bedToGenePred stdin tmp.gp
#below substr($1,4,4) must be edited so that it returns a numeric field.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,4,4),name2,"cmpl","cmpl",0}' tmp.gp  > tmp2.gp
# The -t argument below must be a single hard tab between the quotes (type ctrl-V, then Tab); it shows up as spaces in this file.
join -t "     " -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 1.13 1.14 1.15  tmp2.gp pyrAer1-genbank-cds.xra > pyrAer1.gp
ldHgGene pyrAer1 refSeq pyrAer1.gp -predTab -genePredExt

# GENBANK rRNA GENES (NOT QUITE DONE)
    ssh hgwdev
    cd /cluster/data/pyrAer1/genbank
    gbArchaeGenome -kind=rRNA pyrAer1.gbk pyrAer1-params-bed.txt pyrAer1-rrnas.bed
    echo 'gene
product
NA' > pyrAer1-params-rrna-xra.txt
    gbArchaeGenome -kind=rRNA pyrAer1.gbk pyrAer1-params-rrna-xra.txt pyrAer1-rrnas-xra.txt
    hgLoadBed pyrAer1 gbRRNA pyrAer1-rrnas.bed
    hgsql pyrAer1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table minGeneInfo to gbRRNAXra | hgsql pyrAer1
    echo load data local infile \'pyrAer1-rrnas-xra.txt\' into table gbRRNAXra | hgsql pyrAer1

# COG STUFF
    # Cut and paste http://www.ncbi.nlm.nih.gov/cgi-bin/COG/palox into emacs (COG list)
    # and save as cogpage.txt
    awk '{printf("%s\t%s\n",$6,$5)}' < cogpage.txt | sed -e 's/\[//' -e 's/\]//' > cogs.txt
    rm cogpage.txt
    # Now we have the basic list of cogs and the letter code for each one.
    

# TODD LOWE tRNA GENES (DONE 01/15/04)

    # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
    # Lowe.  See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
    # **Showing the tRNAScanSE instructions would be nice in the future.  
    ssh hgwdev
    mkdir /cluster/data/pyrAer1/bed/loweTrnaGene
    cd /cluster/data/pyrAer1/bed/loweTrnaGene
    hgLoadBed -tab pyrAer1 loweTrnaGene pyrAer1-lowe-trnas.bed -sqlTable=~/kent/src/hg/lib/loweTrnaGene.sql

# TODD LOWE snoRNA GENES (DONE 01/16/04)
    # This is a bed 6 file created by hand.
    ssh hgwdev
    mkdir /cluster/data/pyrAer1/bed/loweSnoGene
    cd /cluster/data/pyrAer1/bed/loweSnoGene
    hgLoadBed -tab pyrAer1 loweSnoGene pyrAer1-snos.bed

# TIGR GENES (DONE 02/09/04)
    # First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
    # and fill out the web form as follows:
    #   - Pick "Retrieve attributes for the specified DNA feature within a specific 
    #     organism and/or a specific role category".
    #       * Pick "Pyrobaculum aerophilum IM2", and "Primary and TIGR annotation ORFs" 
    #         from the 1st and 3rd box.
    #       * Select everything from "Choose TIGR Annotation Gene Attributes"
    #       * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
    #       * Select everything from "Choose Other Gene Attributes"
    #   - Click submit, and click save as tab-delimited file.
    ssh hgwdev
    mkdir /cluster/data/pyrAer1/bed/tigrCmrORFs
    cp pyrAer1-tigr.tab /cluster/data/pyrAer1/bed/tigrCmrORFs
    cd /cluster/data/pyrAer1/bed/tigrCmrORFs
    tigrCmrToBed pyrAer1-tigr.tab pyrAer1-tigr.bed
    hgLoadBed -tab pyrAer1 tigrCmrGene pyrAer1-tigr.bed -sqlTable=~/kent/src/hg/lib/tigrCmrGene.sql
    echo RENAME TABLE tigrCmrGene to tigrCmrORFs | hgsql pyrAer1

# Lowe Lab Microarrays (DONE 02/09/04)
    ssh hgwdev
    mkdir /cluster/data/pyrAer1/bed/llaPaePrintA
    cp Pae-arrays.{bed,exps} /cluster/data/pyrAer1/bed/llaPaePrintA
    cd /cluster/data/pyrAer1/bed/llaPaePrintA
    hgLoadBed pyrAer1 llaPaePrintA Pae-arrays.bed
    hgsql pyrAer1 < ~/kent/src/hg/lib/expRecord.sql
    echo RENAME TABLE expRecord to llaPaePrintAExps | hgsql pyrAer1
    echo LOAD DATA LOCAL INFILE \'Pae-arrays.exps\' INTO TABLE llaPaePrintAExps | hgsql pyrAer1
    # for now I'm using both hgFixed and the pyrAer1 database for this... that will change.
    hgsql hgFixed < ~/kent/src/hg/lib/expRecord.sql
    echo RENAME TABLE expRecord to llaPaePrintAExps | hgsql hgFixed
    echo LOAD DATA LOCAL INFILE \'Pae-arrays.exps\' INTO TABLE llaPaePrintAExps | hgsql hgFixed

# AORF TRACK
    # This is another hand-created file, provided originally by Sorel Fitz-Gibbon and massaged to be a bed.
    ssh hgwdev
    mkdir /cluster/data/pyrAer1/bed/primAORF
    cp aorfs.bed /cluster/data/pyrAer1/bed/primAORF
    cd /cluster/data/pyrAer1/bed/primAORF
    hgLoadBed pyrAer1 primAORF aorfs.bed


# CHANGE "chr1" to "chr"

    ssh hgwdev
    mv /cluster/data/pyrAer1/nib/chr1.nib /cluster/data/pyrAer1/nib/chr.nib
    rm /gbdb/pyrAer1/nib/chr1.nib
    ln -s /cluster/data/pyrAer1/nib/chr.nib /gbdb/pyrAer1/nib/chr.nib
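    # a quick script, saved as changeCh.sh (executable and on the PATH), that
    # replaces chr1 with chr, in place, in a file named on its command line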

#!/bin/bash
# changeCh.sh - rewrite every "chr1" as "chr", in place, in each file named
# on the command line.
#
# Usage: changeCh.sh file [file ...]
#
# Every argument is processed, so the script still does the right thing when
# xargs hands it several file names in a single invocation.

# Rewrite chr1 -> chr in each file given as an argument.
# Returns non-zero as soon as one file cannot be processed.
change_chrom() {
  local bed tmp
  for bed in "$@"; do
    # Private scratch file instead of a fixed, shared name under /tmp.
    tmp=$(mktemp) || return 1
    if sed 's/chr1/chr/g' -- "$bed" > "$tmp"; then
      # Copy the contents back instead of mv-ing the scratch file over the
      # target, so the target keeps its own permissions.
      cat -- "$tmp" > "$bed" || { rm -f -- "$tmp"; return 1; }
    else
      echo "changeCh.sh: cannot process $bed" >&2
      rm -f -- "$tmp"
      return 1
    fi
    rm -f -- "$tmp"
  done
}

change_chrom "$@"

    cd /cluster/data/pyrAer1/bed 
    # -n 1 runs changeCh.sh once per .bed file, so each file is its $1.
    find . -name '*.bed' | xargs -n 1 changeCh.sh
    # Now change the DB
    cd /tmp
    hgsqldump pyrAer1 | sed 's/chr1/chr/g' > paeNew.sql
    echo drop database pyrAer1 | hgsql test
    echo create database pyrAer1 | hgsql test
    hgsql pyrAer1 < paeNew.sql
    rm paeNew.sql
    echo 'update dbDb set defaultPos="chr:550000-580000" where name="pyrAer1"' | hgsql -h genome-testdb hgcentraltest

# Pae promoter track

    cd /cluster/data/pyrAer1/wiggle
    mkdir promoterScanPos
    mkdir promoterScanNeg
    cd promoterScanPos
    cp prom.pos.gz .
    wigAsciiToBinary -chrom=chr -wibFile=promoterScanPos prom.pos.gz
    hgLoadWiggle pyrAer1 promoterScanPos promoterScanPos.wig
    cd ../promoterScanNeg
    cp prom.neg.gz .
    wigAsciiToBinary -chrom=chr -wibFile=promoterScanNeg prom.neg.gz
    hgLoadWiggle pyrAer1 promoterScanNeg promoterScanNeg.wig
    cd /gbdb/pyrAer1/wib  
    ln -s /cluster/data/pyrAer1/wiggle/promoterScanPos/promoterScanPos.wib promoterScanPos.wib
    ln -s /cluster/data/pyrAer1/wiggle/promoterScanNeg/promoterScanNeg.wib promoterScanNeg.wib

# shine DG track

    cd /cluster/data/pyrAer1/wiggle
    mkdir shineDGPos
    mkdir shineDGNeg
    cd shineDGPos
    cp shine.pos.gz .
    wigAsciiToBinary -chrom=chr -wibFile=shineDGPos shine.pos.gz
    hgLoadWiggle pyrAer1 shineDGPos shineDGPos.wig
    cd ../shineDGNeg
    cp shine.neg.gz .
    wigAsciiToBinary -chrom=chr -wibFile=shineDGNeg shine.neg.gz
    hgLoadWiggle pyrAer1 shineDGNeg shineDGNeg.wig
    cd /gbdb/pyrAer1/wib  
    ln -s /cluster/data/pyrAer1/wiggle/shineDGPos/shineDGPos.wib shineDGPos.wib
    ln -s /cluster/data/pyrAer1/wiggle/shineDGNeg/shineDGNeg.wib shineDGNeg.wib

# RNA genes

# Work inside a per-track directory, as the other track sections do, so the
# copied .bed and .sql files do not land directly in bed/.
cd /cluster/data/pyrAer1/bed
mkdir rnaGenes
cd rnaGenes
cp paeAllRNA.bed .
cp ~/kent/src/hg/lib/rnaGenes.sql .
hgLoadBed -sqlTable=rnaGenes.sql pyrAer1 rnaGenes paeAllRNA.bed
