# for emacs: -*- mode: sh; -*-

 
# This file describes building the browser database for the archaeal
# species Methanocaldococcus jannaschii (database methJann1).

# DOWNLOAD SEQUENCE FROM GENBANK (DONE)
# Copies the NC_00* FASTA records from /projects/lowelab/db, renames the three
# records to .fa, concatenates them into methJann1.fa and builds methJann1.2bit.

    ssh eieio
    mkdir /cluster/store5/archae/methJann1
    # /cluster/data/methJann1 is a symlink to the store5 directory; the other
    # sections of this file all work through the /cluster/data path.
    ln -s /cluster/store5/archae/methJann1 /cluster/data/methJann1
    cd /cluster/data/methJann1
    cp /projects/lowelab/db/Bacteria/Methanococcus_jannaschii/NC_00*.fna .
    mv NC_000909.fna NC_000909.fa
    mv NC_001732.fna NC_001732.fa
    mv NC_001733.fna NC_001733.fa
    # Edit header of *.fa to '> methJann1 >smallextrachr >largeextrachr'
    # NOTE(review): later steps (defaultPos "chr1:...", the sed that rewrites
    # ">c" headers, and the ">methJann1.chr2"/">methJann1.chr3" matches) imply the
    # sequences are actually named chr1, chr2, chr3 -- confirm the header text used.
    # Concatenation order below fixes the sequence order in the combined file.
    cat NC_000909.fa NC_001732.fa NC_001733.fa > methJann1.fa
   
    faToTwoBit methJann1.fa methJann1.2bit 


# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE)
# Creates the methJann1 MySQL database, writes chrom.sizes, copies the grp
# table from hg16, and registers the assembly in hgcentraltest
# (dbDb, defaultDb and genomeClade).

    ssh hgwdev
    echo 'create database methJann1' | hgsql ''
    cd /cluster/data/methJann1

    # chrom.sizes is loaded into the chromInfo table in the next section.
    faSize -detailed methJann1.fa > chrom.sizes
    # grp is created as a copy of hg16.grp with NAME as primary key.
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
	    | hgsql methJann1
    # dbDb row: htmlPath points at /gbdb/methJann1/html/description.html, which
    # is linked into place in the DESCRIPTION PAGE section.
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk) values \
        ("methJann1", "August 1996", "/gbdb/methJann1", "Methanocaldococcus jannaschii", \
               "chr1:500000-550000", 1, 245, "Methanocaldococcus jannaschii", \
                "Methanocaldococcus jannaschii DSM 2661", "/gbdb/methJann1/html/description.html", \
                0);' \
      | hgsql hgcentraltest
    # Make methJann1 the default database for this genome name.
    echo 'INSERT INTO defaultDb (genome, name) values ("Methanocaldococcus jannaschii", "methJann1");' \
      | hgsql hgcentraltest
    # Place the genome in the "archaea" clade with priority 85.
    echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Methanocaldococcus jannaschii", "archaea",85);'  \
      | hgsql hgcentraltest

# CREATE CHROMINFO TABLE (DONE)
# Creates chromInfo from the kent-tree schema, loads chrom.sizes into it, points
# every row at the 2bit file, then registers the assembly's trackDb directory
# in CVS.
  ssh hgwdev
  cd /cluster/data/methJann1

   cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
   # Manual step (not a command): edit chromInfo.sql to allow fileName to be
   # null. Presumably needed because the load below fills the table from
   # chrom.sizes only, and fileName is set afterwards by the update statement.
   hgsql methJann1 < chromInfo.sql
   echo "load data local infile 'chrom.sizes' into table chromInfo" | hgsql methJann1
   echo "update chromInfo set fileName = '/gbdb/methJann1/methJann1.2bit'" | hgsql methJann1

    cd ~/kent/src/hg/makeDb/trackDb

    # add the trackDb directories
    mkdir -p archae/methJann1
    cvs add archae/methJann1
    cvs commit

    # NOTE(review): the directory steps are recorded a second time below; this
    # second pass also adds the parent archae directory before the child.
    cd ~/kent/src/hg/makeDb/trackDb
    # edit the trackDb makefile

    # add the trackDb directories
    mkdir -p archae/methJann1
    cvs add archae
    cvs add archae/methJann1
    cvs commit

# GC20BASE (DONE)
# Builds the gc20Base wiggle track: hgGcPercent output (20-base windows) is
# piped through wigEncode on kkstore02, then the result is linked into /gbdb
# and loaded on hgwdev.
    ssh kkstore02
    mkdir -p /cluster/data/methJann1/bed/gc20Base
    cd /cluster/data/methJann1/bed/gc20Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=20 methJann1 \
        /cluster/data/methJann1/ | wigEncode stdin gc20Base.wig gc20Base.wib
    ssh hgwdev
    cd /cluster/data/methJann1/bed/gc20Base
    # This creates /gbdb/methJann1/wib; the conservation section later links
    # its own .wib files into the same directory.
    mkdir /gbdb/methJann1/wib
    ln -s `pwd`/gc20Base.wib /gbdb/methJann1/wib
    hgLoadWiggle -pathPrefix=/gbdb/methJann1/wib methJann1 gc20Base gc20Base.wig
    #	verify index is correct:
    hgsql methJann1 -e "show index from gc20Base;"
    #	should see good numbers in Cardinality column


# TANDEM REPEAT MASKER (DONE)
# Runs trfBig over the combined FASTA, keeping only the BED output
# (the masked-sequence output goes to /dev/null), and loads it as simpleRepeat.

    ssh hgwdev
    mkdir -p /cluster/data/methJann1/bed/simpleRepeat
    cd /cluster/data/methJann1
    trfBig methJann1.fa /dev/null -bedAt=/cluster/data/methJann1/bed/simpleRepeat/chr.bed
    cd /cluster/data/methJann1/bed/simpleRepeat
    # NOTE(review): the schema path here is ~kent/... (user kent's home), while
    # other sections use ~/kent/... -- confirm which was intended.
    hgLoadBed methJann1 simpleRepeat *.bed -sqlTable=~kent/src/hg/lib/simpleRepeat.sql

# MULTIZ with metMar1, methKand1, methTher1
# DONE (10/12/05), kpollard
# Setup for the conservation track: gathers the four genomes as *.chr FASTA
# files, converts them to nib/2bit, splits methJann1 into one file per
# sequence, and writes a combined chrom.sizes.

    cd /cluster/data/methJann1/bed/
    mkdir conservation
    cd conservation
    # Scoring file is reused from the metAce1 build.
    cp /cluster/data/metAce1/bed/conservation/HoxD55.q .
    cp ../../methJann1.fa methJann1.chr
    cp /cluster/data/metMar1/chr.fa metMar1.chr
    cp /projects/lowelab/db/Bacteria/Methanobacterium_thermoautotrophicum/Meth_ther.fa methTher1.chr
    cp /cluster/data/methKand1/methKand1.fa methKand1.chr
    #fix names to be metMar1, methTher1, methKand1 manually
    # Prefix methJann1 headers that start with ">c" so they read ">methJann1.c...".
    sed s/\>c/\>methJann1.c/ methJann1.chr > temp
    mv temp methJann1.chr
    faToNib methKand1.chr methKand1.chr.nib
    faToNib metMar1.chr metMar1.chr.nib
    faToNib methTher1.chr methTher1.chr.nib
    faToTwoBit methJann1.chr methJann1.2bit
    # The three gawk passes split methJann1.chr by toggling a print flag on the
    # ">methJann1.chr2" and ">methJann1.chr3" header lines (lines print while ind==0):
    #   pass 1 prints everything before the chr2 header,
    #   pass 2 prints from the chr2 header up to (not including) the chr3 header,
    #   pass 3 prints from the chr3 header to the end.
    cat methJann1.chr | gawk 'BEGIN{ind=0;}{if($1==">methJann1.chr2"){ind=1;} if(ind==0){print $0;}}' > methJann1.chr1
    cat methJann1.chr | gawk 'BEGIN{ind=1;}{if($1==">methJann1.chr2"){ind=0;} if($1==">methJann1.chr3"){ind=1;} if(ind==0){print $0;}}' > methJann1.chr2
    cat methJann1.chr | gawk 'BEGIN{ind=1;}{if($1==">methJann1.chr3"){ind=0;} if(ind==0){print $0;}}' > methJann1.chr3
    faToNib methJann1.chr1 methJann1.chr1.nib
    faToNib methJann1.chr2 methJann1.chr2.nib
    faToNib methJann1.chr3 methJann1.chr3.nib
    #chrom sizes
    # Sizes for every *.chr file in this directory; used by axtToMaf below.
    faSize -detailed *.chr > chrom.sizes

    #blastz 
    # One blastz run per (other genome, methJann1 sequence) pair, with
    # Q=HoxD55.q. Loop order is genome-major, sequence-minor.
    foreach sp (metMar1 methTher1 methKand1)
     foreach n (1 2 3)
      blastz methJann1.chr${n} ${sp}.chr Q=HoxD55.q > methJann1.c${n}-${sp}.lav
     end
    end

    # Convert every .lav to .axt (same pair order as above).
    foreach sp (metMar1 methTher1 methKand1)
     foreach n (1 2 3)
      /cluster/bin/i386/lavToAxt methJann1.c${n}-${sp}.lav . . methJann1.c${n}-${sp}.axt
     end
    end

    # Keep the best alignments per pair (500-base window, minimum score 5000).
    foreach sp (metMar1 methTher1 methKand1)
     foreach n (1 2 3)
      axtBest methJann1.c${n}-${sp}.axt methJann1.chr${n} -winSize=500 -minScore=5000 methJann1.c${n}-${sp}-best.axt
     end
    end

    # Turn each *-best.axt into a .maf named after the pair (suffix stripped).
    foreach best (*-best.axt)
     set stem=`basename $best -best.axt`
     axtToMaf $best chrom.sizes chrom.sizes $stem.maf
    end
     
    #multiz
    # For each methJann1 sequence, multiz is run twice: first merging the
    # metMar1 and methTher1 pairwise MAFs, then merging the methKand1 MAF with
    # that result to give the four-species MAF.
    #remove extra header lines
    #chr1
    multiz methJann1.c1-metMar1.maf methJann1.c1-methTher1.maf - > methJann1.c1-metMar1-methTher1.maf
    multiz methJann1.c1-methKand1.maf methJann1.c1-metMar1-methTher1.maf - > methJann1.c1-metMar1-methTher1-methKand1.maf
    #chr2
    multiz methJann1.c2-metMar1.maf methJann1.c2-methTher1.maf - > methJann1.c2-metMar1-methTher1.maf
    multiz methJann1.c2-methKand1.maf methJann1.c2-metMar1-methTher1.maf - > methJann1.c2-metMar1-methTher1-methKand1.maf
    #nothing on chr3
    # The chr3 commands are still recorded here, but the later steps in this
    # file only use the chr1 and chr2 outputs.
    multiz methJann1.c3-metMar1.maf methJann1.c3-methTher1.maf - > methJann1.c3-metMar1-methTher1.maf
    multiz methJann1.c3-methKand1.maf methJann1.c3-metMar1-methTher1.maf - > methJann1.c3-metMar1-methTher1-methKand1.maf

    #phyloHMM
    # Converts the chr1/chr2 four-species MAFs to SS format, fits a tree model
    # per sequence with phyloFit, estimates conserved/non-conserved models with
    # phastCons, then runs phastCons again to produce scores and elements.
    /cluster/bin/phast/msa_view -i MAF -M methJann1.chr1 -o SS methJann1.c1-metMar1-methTher1-methKand1.maf > methJann1.c1.ss
    /cluster/bin/phast/msa_view -i MAF -M methJann1.chr2 -o SS methJann1.c2-metMar1-methTher1-methKand1.maf > methJann1.c2.ss
    /cluster/bin/phast/phyloFit -i SS methJann1.c1.ss -t "(methKand1,(methTher1,(metMar1,methJann1)))" -o chr1_MjMmMtMk
    /cluster/bin/phast/phyloFit -i SS methJann1.c2.ss -t "(methKand1,(methTher1,(metMar1,methJann1)))" -o chr2_MjMmMtMk
    /cluster/bin/phast/draw_tree chr1_MjMmMtMk.mod > methJann1-tree.ps
    #just use chr1 model for starting model (more data)
    /cluster/bin/phast/msa_view -i SS methJann1.c1.ss --summary-only
    /cluster/bin/phast/msa_view -i SS methJann1.c2.ss --summary-only
    #add GC content to next call
    # The --gc values below are presumably taken from the two summaries above
    # -- confirm. Both runs start from the chr1 model (chr1_MjMmMtMk.mod).
    /cluster/bin/phast/phastCons methJann1.c1.ss chr1_MjMmMtMk.mod \
    --gc 0.3786 --target-coverage 0.7 --estimate-trees chr1-tree \
    --expected-lengths 25 --no-post-probs --ignore-missing \
    --nrates 1,1
    /cluster/bin/phast/phastCons methJann1.c2.ss chr1_MjMmMtMk.mod \
    --gc 0.2856 --target-coverage 0.7 --estimate-trees chr2-tree \
    --expected-lengths 25 --no-post-probs --ignore-missing \
    --nrates 1,1
    #average with phyloBoot to get cons.mod and noncons.mod
    #ls *.cons.mod > cons.txt
    #phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
    #ls *.noncons.mod > noncons.txt
    #phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
    #PROBLEM: Can't do this with different numbers of species in each mod
    #         Here there is no methKand1 in chr2 data.
    #Again, just use chrom1 model
    # Final scoring runs: both sequences use the chr1-tree models. Scores go to
    # chrN_cons.dat and predicted elements to chrN_methJann1-elements.bed.
    /cluster/bin/phast/phastCons methJann1.c1.ss \
    chr1-tree.cons.mod,chr1-tree.noncons.mod \
    --target-coverage 0.7 --expected-lengths 25 \
    --viterbi chr1_methJann1-elements.bed --score \
    --require-informative 0 --seqname chr1 > chr1_cons.dat
   /cluster/bin/phast/phastCons methJann1.c2.ss \
   chr1-tree.cons.mod,chr1-tree.noncons.mod \
    --target-coverage 0.7 --expected-lengths 25 \
    --viterbi chr2_methJann1-elements.bed --score \
    --require-informative 0 --seqname chr2 > chr2_cons.dat
    wigEncode chr1_cons.dat chr1_phastCons.wig chr1_phastCons.wib
    wigEncode chr2_cons.dat chr2_phastCons.wig chr2_phastCons.wib    
    #combine phastCons elements into 1 bed file
    # chr*.bed matches the two chrN_methJann1-elements.bed files written above.
    cat chr*.bed > phastCons.bed

    #move data
    # Arrange outputs into wib/, otherSpp/<species>/ and multiz/, and symlink
    # each into /gbdb/methJann1 so the loaders below can reference them there.
    # Only chr1 and chr2 files are moved.
    mkdir wib
    mv chr*_phastCons.wib wib/.
    mv chr*_phastCons.wig wib/.
    ln -s /cluster/data/methJann1/bed/conservation/wib/*.wib /gbdb/methJann1/wib
    mkdir /gbdb/methJann1/pwMaf
    mkdir -p otherSpp/metMar1 otherSpp/methTher1 otherSpp/methKand1
    mv methJann1.c1-methTher1.maf otherSpp/methTher1/chr1.maf
    mv methJann1.c2-methTher1.maf otherSpp/methTher1/chr2.maf
    mv methJann1.c1-metMar1.maf otherSpp/metMar1/chr1.maf
    mv methJann1.c2-metMar1.maf otherSpp/metMar1/chr2.maf
    mv methJann1.c1-methKand1.maf otherSpp/methKand1/chr1.maf
    mv methJann1.c2-methKand1.maf otherSpp/methKand1/chr2.maf
    # Each pairwise directory is linked under the name of the table it is
    # loaded into (<species>_pwMaf).
    ln -s /cluster/data/methJann1/bed/conservation/otherSpp/methTher1 /gbdb/methJann1/pwMaf/methTher1_pwMaf
    ln -s /cluster/data/methJann1/bed/conservation/otherSpp/methKand1 /gbdb/methJann1/pwMaf/methKand1_pwMaf
    ln -s /cluster/data/methJann1/bed/conservation/otherSpp/metMar1 /gbdb/methJann1/pwMaf/metMar1_pwMaf
    mkdir multiz
    mv methJann1.c1-metMar1-methTher1-methKand1.maf multiz/chr1.maf
    mv methJann1.c2-metMar1-methTher1-methKand1.maf multiz/chr2.maf
    ln -s /cluster/data/methJann1/bed/conservation/multiz /gbdb/methJann1/multizMjMmMtMk

    #load
    # Tables loaded: phastCons (wiggle), multizMjMmMtMk, the three *_pwMaf
    # tables, and phastConsElements (bed).
    hgLoadWiggle methJann1 phastCons /cluster/data/methJann1/bed/conservation/wib/chr*_phastCons.wig
    hgLoadMaf -warn methJann1 multizMjMmMtMk
    hgLoadMaf -warn methJann1 methTher1_pwMaf -pathPrefix=/gbdb/methJann1/pwMaf/methTher1_pwMaf
    hgLoadMaf -warn methJann1 methKand1_pwMaf -pathPrefix=/gbdb/methJann1/pwMaf/methKand1_pwMaf
    hgLoadMaf -warn methJann1 metMar1_pwMaf -pathPrefix=/gbdb/methJann1/pwMaf/metMar1_pwMaf
    hgLoadBed methJann1 phastConsElements phastCons.bed 

    #trackDb
    # Adds the trackDb.ra entry and the details page for the multiz track.
    # The mkdir / cvs add of methJann1 repeats what the CHROMINFO section
    # already records for this directory.
    cd ~/kent/src/hg/makeDb/trackDb/archae
    mkdir methJann1
    cvs add methJann1
    cd methJann1
    #trackDb.ra entry
    # (the commented lines below are the stanza to put in trackDb.ra; the
    # wiggle and pairwise settings name the phastCons table and the *_pwMaf
    # table suffix loaded above)
    # track multizMjMmMtMk
    # shortLabel Conservation
    # longLabel Methanogen multiz alignments
    # group compGeno
    # priority 10.0
    # visibility pack
    # type wigMaf 0.0 1.0
    # maxHeightPixels 100:40:11
    # wiggle phastCons
    # yLineOnOff Off
    # autoScale Off
    # pairwise pwMaf
    # speciesOrder metMar1 methTher1 methKand1
    cvs add trackDb.ra
    cvs commit -m "New multiz track" trackDb.ra
    #html page
    cvs add multizMjMmMtMk.html
    cvs commit -m "Details page for multiz track" multizMjMmMtMk.html

# DESCRIPTION PAGE ()
# Publishes description.html at the htmlPath registered in dbDb
# (/gbdb/methJann1/html/description.html).

    ssh hgwdev
    # Write ~/kent/src/hg/makeDb/trackDb/archae/methJann1/description.html
    chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/methJann1/description.html
    # Check it in.
    mkdir /gbdb/methJann1/html
    # NOTE(review): the link target /cluster/data/methJann1/html/description.html
    # is not created by any step shown in this file -- confirm the page was
    # copied there from the trackDb directory.
    ln -s /cluster/data/methJann1/html/description.html /gbdb/methJann1/html/

# GENBANK PROTEIN-CODING GENES ()
# Extracts CDS features from the GenBank record with gbArchaeGenome into three
# files (bed, peptide, extra info) and loads the peptide and extra-info files
# into tables gbProtCodePep and gbProtCodeXra.

    ssh hgwdev
    mkdir /cluster/data/methJann1/genbank
    cd /cluster/data/methJann1/genbank
    cp /projects/lowelab/db/Bacteria/Methanococcus_jannaschii/NC_00*.gbk .
    # NOTE(review): NC_003552 is not one of the accessions handled in the
    # download section (NC_000909, NC_001732, NC_001733); this line looks
    # carried over from another genome's notes -- confirm which .gbk file(s)
    # should become methJann1.gbk.
    mv NC_003552.gbk methJann1.gbk
    # Rename the GeneID db_xref qualifier to db_xref2 so it can be requested
    # separately from the remaining db_xref entries in the xra parameter file.
    sed -e 's/db_xref="GeneID:/db_xref2="/' methJann1.gbk > methJann1.fix.gbk
    # Create 3 files to assist parsing of the genbank
    # 1. for a bed file
    echo 'chr
start
end
gene
1000
strand' > methJann1-params-bed.txt
    # 2. for the peptide parts
    echo 'gene
translation' > methJann1-params-pep.txt
    # 3. for the other gene information
    echo 'locus_tag
db_xref2
product
note
protein_id
db_xref
EC_number
pseudo' > methJann1-params-xra.txt
    # Now extract the genes and information:
    gbArchaeGenome methJann1.fix.gbk methJann1-params-bed.txt methJann1-genbank-cds.bed
    gbArchaeGenome methJann1.fix.gbk methJann1-params-pep.txt methJann1-genbank-cds.pep
    gbArchaeGenome methJann1.fix.gbk methJann1-params-xra.txt methJann1-genbank-cds.xra
    # Create the tables from the generic pepPred / minGeneInfo schemas, then
    # rename them to the track-specific names before loading.
    hgsql methJann1 < ~/kent/src/hg/lib/pepPred.sql
    hgsql methJann1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table pepPred to gbProtCodePep | hgsql methJann1
    echo rename table minGeneInfo to gbProtCodeXra | hgsql methJann1
    echo load data local infile \'methJann1-genbank-cds.pep\' into table gbProtCodePep | hgsql methJann1
    echo load data local infile \'methJann1-genbank-cds.xra\' into table gbProtCodeXra | hgsql methJann1

#genbank to genePred
# Converts the CDS bed file into an extended genePred and loads it as refSeq.
# Step 1 pads each 6-column bed line out to 12 columns (one block covering the
# whole feature) and runs bedToGenePred.
# Step 2 appends the extended columns; the awk variable name2 is never
# assigned, so that column is written empty.
# Step 3 uses join to emit column 2 of the .xra file in that 12th position.
# NOTE(review): join expects both inputs sorted on the join field and no sort
# is shown here -- confirm the inputs were already in matching order.

csh
tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' methJann1-genbank-cds.bed | bedToGenePred stdin tmp.gp
tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,3,4),name2,"cmpl","cmpl",0}' tmp.gp  > tmp2.gp
join -t "     " -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 1.13 1.14 1.15  tmp2.gp methJann1-genbank-cds.xra > methJann1.gp
ldHgGene methJann1 refSeq methJann1.gp -predTab -genePredExt

# GENBANK rRNA GENES ()
# Extracts rRNA features from the GenBank record (bed plus extra-info file),
# loads the bed as gbRRNA and the extra info as gbRRNAXra.
    # Host corrected from "hgdev" to hgwdev, the host used by every other
    # database-loading section in this file.
    ssh hgwdev
    cd /cluster/data/methJann1/genbank
    # Reuses the bed parameter file written in the protein-coding section.
    gbArchaeGenome -kind=rRNA methJann1.gbk methJann1-params-bed.txt methJann1-rrnas.bed
    echo 'gene product NA' > methJann1-params-rrna-xra.txt
    gbArchaeGenome -kind=rRNA methJann1.gbk methJann1-params-rrna-xra.txt methJann1-rrnas-xra.txt
    hgLoadBed methJann1 gbRRNA methJann1-rrnas.bed
    # Create the table from the generic minGeneInfo schema, then rename it.
    hgsql methJann1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table minGeneInfo to gbRRNAXra | hgsql methJann1
    echo load data local infile \'methJann1-rrnas-xra.txt\' into table gbRRNAXra | hgsql methJann1

# COG STUFF
# Pulls the lines mentioning COG out of the .ptt files, writes columns 6, 8, 7
# (tab-separated) to the file COG, and loads it into table COG.
# NOTE(review): NR counts lines that passed the grep, so 'NR>3' drops the
# first three matching lines rather than file header lines -- confirm this
# is intended.
  grep COG /projects/lowelab/db/Bacteria/Methanococcus_jannaschii/NC_00*.ptt | awk 'NR>3{OFS="\t";print $6,$8,$7}'  > COG
  hgsql methJann1 < /cluster/home/baertsch/kent/src/hg/lib/cogs.sql
  echo "load data local infile 'COG' into table COG" | hgsql methJann1
# load cog codes
# (the SQL file is taken from the metAce1 build directory)
 hgsql methJann1 < /cluster/data/metAce1/genbank/COGXra.sql

    

# TODD LOWE tRNA GENES ()

    # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
    # Lowe.  See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
    # **Showing the tRNAScanSE instructions would be nice in the future.
    # No step here creates or copies methJann1-lowe-trnas.bed; it must already
    # be in the loweTrnaGene directory before the load.
    ssh hgwdev
    mkdir /cluster/data/methJann1/bed/loweTrnaGene
    cd /cluster/data/methJann1/bed/loweTrnaGene
    hgLoadBed -tab methJann1 loweTrnaGene methJann1-lowe-trnas.bed -sqlTable=~/kent/src/hg/lib/loweTrnaGene.sql

# TODD LOWE snoRNA GENES ()
    # This is a bed 6 file created by hand.
    # No step here creates or copies methJann1-snos.bed; it must already be in
    # the loweSnoGene directory before the load.
    ssh hgwdev
    mkdir /cluster/data/methJann1/bed/loweSnoGene
    cd /cluster/data/methJann1/bed/loweSnoGene
    hgLoadBed -tab methJann1 loweSnoGene methJann1-snos.bed

# TIGR GENES ()
    # First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
    # and fill out the web form as follows:
    #   - Pick "Retrieve attributes for the specified DNA feature within a specific
    #     organism and/or a specific role category".
    #       * Pick the entry for this genome (Methanocaldococcus jannaschii) and
    #         "Primary and TIGR annotation ORFs" from the 1st and 3rd box.
    #         NOTE(review): this step previously named "Pyrobaculum aerophilum IM2",
    #         which looks carried over from another genome's notes -- confirm the
    #         exact organism label used on the CMR form.
    #       * Select everything from "Choose TIGR Annotation Gene Attributes"
    #       * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
    #       * Select everything from "Choose Other Gene Attributes"
    #   - Click submit, and click save as tab-delimited file.
    # The saved file is expected as methJann1-tigr.tab in the current directory;
    # it is converted to bed with tigrCmrToBed and loaded as tigrCmrORFs.
    ssh hgwdev
    mkdir /cluster/data/methJann1/bed/tigrCmrORFs
    cp methJann1-tigr.tab /cluster/data/methJann1/bed/tigrCmrORFs
    cd /cluster/data/methJann1/bed/tigrCmrORFs
    /projects/lowelab/users/aamp/bin/i386/tigrCmrToBed methJann1-tigr.tab methJann1-tigr.bed
    hgLoadBed -tab methJann1 tigrCmrORFs methJann1-tigr.bed -sqlTable=~/kent/src/hg/lib/tigrCmrGene.sql
