# for emacs: -*- mode: sh; -*-


# Apis mellifera -- 
# 
# Baylor HGSC's May 5, 2005 (Amel_3.0) assembly
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050501-freeze/
#

# DOWNLOAD SEQUENCE (DONE 12/12/05 Andy)
    # Check free space on the stores before choosing where the assembly lives.
    df -h /cluster/store*
#Filesystem            Size  Used Avail Use% Mounted on
#kkstore01-10:/export/cluster/store9
#                      1.3T  697G  546G  57% /cluster/store9
    # store9, whatever
    mkdir -p /cluster/store9/apiMel3/downloads
    ln -s /cluster/store9/apiMel3 /cluster/data/apiMel3
    ln -s /cluster/data/apiMel3 ~/apiMel3
    # Absolute path (the same one the later sections cd to) so this step does
    # not depend on being started from the home directory.
    cd /cluster/data/apiMel3/downloads
    baseUrl=ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050501-freeze/contigs
    wget -N "$baseUrl/All_Groups_20050501.agp"
    # Only the first nine AGP columns are handed to agpCondense.
    cut -f1-9 All_Groups_20050501.agp > tmp.agp
    agpCondense tmp.agp apiMel3.agp
    rm tmp.agp
    # The loop below appends, so start from a clean slate: a contigs.fa left
    # behind by an interrupted run would otherwise duplicate sequence.
    rm -f contigs.fa
    for i in {1..16} Un; do
        f=Group${i}_20050501.fa.gz
        wget -N "$baseUrl/$f"
        zcat "$f" >> contigs.fa
    done
    # Build the assembled FASTA from the condensed AGP plus all contigs.
    agpAllToFaFile apiMel3.agp contigs.fa apiMel3.fa
    rm contigs.fa

# CREATING DATABASE (DONE 12/13/2005 Andy)
    # On hgwdev: create the apiMel3 SQL database that the later sections
    # load their tables into.
    ssh hgwdev
    hgsql '' -e 'create database apiMel3'

# LOAD GAP & GOLD TABLES FROM AGP (DONE 12/13/2005 Andy)
    ssh hgwdev
    cd /cluster/data/apiMel3/downloads
    # apiMel3.agp is the condensed AGP written by agpCondense in the
    # download step.
    hgGoldGapGl -noGl apiMel3 apiMel3.agp
    # Run ANALYZE TABLE on the two freshly loaded tables.
    hgsql apiMel3 -e 'analyze table gold; analyze table gap;'

# REPEATMASK (DONE 12/13/2005 Andy)
    ssh hgwdev
    # NOTE(review): the original log said "cd san/honeybee/" here, but every
    # later step (the rmsk load below and the TRF run) works in san/apiMel3 --
    # presumably the directory was renamed; confirm before re-running.
    cd san/apiMel3/
    # rmsk/out matches the ../out/ path used by the gsub template below.
    mkdir -p unmaskedSplits/splits rmsk/{run,out}
    # Split the assembly (faSplit "gap" mode, size argument 500000); the lift
    # file written here is what liftUp uses further down.
    faSplit -outDirDepth=1 -lift=unmaskedSplits/splits.lft gap /cluster/data/apiMel3/downloads/apiMel3.fa 500000 unmaskedSplits/honey_
    cd rmsk/
    ln -s ../unmaskedSplits splits
    cd run/
    # Per-job wrapper: $1 = input split, $2 = output .out file, both relative
    # to the directory the job is started from.
    cat > rmsk.sh << "EOF"
#!/bin/bash
# Run RepeatMasker on one split in local /scratch and copy the .out back.
fa=$(basename "$1")
# An array keeps "-spec apis" as two separate arguments.  The unquoted scalar
# form (param=-spec apis) makes bash try to run a command named "apis"
# instead of setting the variable.
param=(-spec apis)

pushd /scratch || exit 1
# -l prints the full path; a ~-abbreviated path would not be expanded again
# when it comes out of a variable.
oldDir=$(dirs -l +1)
cp "$oldDir/$1" . || exit 1
/cluster/bluearc/RepeatMasker/RepeatMasker "${param[@]}" "$fa"
mkdir -p "$oldDir/$(dirname "$2")"
cp "$fa.out" "$oldDir/$2"
# Remove the local copy of the input and everything generated next to it.
rm -- "$fa"*
popd
EOF
    chmod +x rmsk.sh
    cat > gsub << "EOF"
#LOOP
./rmsk.sh {check in line+ $(path1)} {check out exists ../out/$(file1).out}
#ENDLOOP
EOF
    find ../splits/ -name '*.fa' > fa.lst
    gensub2 fa.lst single gsub spec
    para create spec
    para try
    para push
    para time
#Completed: 1247 of 1247 jobs
#CPU time in finished jobs:     809754s   13495.90m   224.93h    9.37d  0.026 y
#IO & Wait Time:                  9761s     162.68m     2.71h    0.11d  0.000 y
#Average job time:                 657s      10.95m     0.18h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1847s      30.78m     0.51h    0.02d
#Submission to last job:          4476s      74.60m     1.24h    0.05d
    cd ../
    # Take the first three lines of one output file as the header, then
    # append line 4 onward of every output file larger than 80 bytes.
    head -n 3 out/honey_0177.fa.out > head
    # "tail -n +4" is the POSIX spelling of the obsolete "tail +4".
    find out/ -name "*.fa.out" -type f -size +80c -exec tail -n +4 '{}' ';' > tmp
    cat head tmp > tmp2
    # Lift the combined output using the lift file written by faSplit.
    liftUp rmsk.out splits/splits.lft warn tmp2
    rm head tmp tmp2
    ssh hgwdev
    cd san/apiMel3/rmsk
    hgLoadOut apiMel3 rmsk.out
    hgsql apiMel3 -e 'rename table rmsk_rmsk to rmsk'
    # Rebuild the three indexes with an 11-character genoName prefix.
    hgsql apiMel3 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(11), bin); \
                  create index genoStart on rmsk (genoName(11), genoStart); \
                  create index genoEnd   on rmsk (genoName(11), genoEnd);'
    # Drop the symlink to the splits, then move the results under
    # /cluster/data/apiMel3.
    rm splits
    cd ../
    cp -r rmsk/ /cluster/data/apiMel3/
    rm -rf rmsk/

# SIMPLE REPEATS (TRF) (DONE 10/22/2005 Andy)
    ssh pk
    cd san/apiMel3
    mkdir -p simpleRepeat/{run,out}
    cd simpleRepeat/
    # Reuse the unmasked splits made by faSplit in the RepeatMasker section.
    ln -s ../unmaskedSplits splits
    cd run/
    cat << "_EOF_" > gsub
#LOOP
./trfBig.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDir1)/$(root1).bed}
#ENDLOOP
_EOF_
    # Per-job wrapper: $1 = input split, $2 = output .bed path.
    cat << "_EOF_" > trfBig.sh
#!/bin/bash
# Run trfBig on one split in local /scratch and copy the .bed file back.
outDir=$(dirname "$2")
outName=$(basename "$2")
inName=$(basename "$1")

mkdir -p "$outDir"
cp "$1" /scratch
# Stop here if /scratch is unusable rather than running trfBig in the
# submit directory.
pushd /scratch || exit 1
trfBig "$inName" "$outName" -bed -tempDir=/tmp > /dev/null
popd
cp "/scratch/$outName" "$2"
rm "/scratch/$outName" "/scratch/$inName"
_EOF_
    chmod +x trfBig.sh
    find ../splits/ -name '*.fa' > fa.lst
    gensub2 fa.lst single gsub spec
    para create spec
    para push
    para time
#Completed: 1246 of 1247 jobs
#CPU time in finished jobs:       3355s      55.91m     0.93h    0.04d  0.000 y
#IO & Wait Time:                 10020s     167.00m     2.78h    0.12d  0.000 y
#Average job time:                  11s       0.18m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             130s       2.17m     0.04h    0.00d
#Submission to last job:           179s       2.98m     0.05h    0.00d
    ssh hgwdev
    cd san/apiMel3/simpleRepeat/
    # Collect the lines starting with "honey" (the split-name prefix) from
    # every .bed file, then lift them with the faSplit lift file.
    find out/ -name "*.bed" -type f -exec grep "^honey" '{}' ';' > tmp
    liftUp trf.bed splits/splits.lft warn tmp
    rm tmp
    hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \
       apiMel3 simpleRepeat trf.bed
    rm splits
    cd ../
    # Make sure the parent exists first: if "bed" were missing, cp -r would
    # create "bed" as the copy itself instead of bed/simpleRepeat, which is
    # the path the masking step reads.
    mkdir -p /cluster/data/apiMel3/bed
    cp -r simpleRepeat/ /cluster/data/apiMel3/bed

# MAKE MASKED NIBS/2BIT (DONE 10/22/2005 Andy)
    # make a filtered version of the trf output keep trf's with period <= 12:
    ssh hgwdev
    cd /cluster/data/apiMel3
    # -f lets zcat read the file whether its content is plain text or gzip
    # data; the TRF section writes trf.bed uncompressed, and plain zcat
    # rejects a file that is not in gzip format.
    zcat -f bed/simpleRepeat/trf.bed | awk '{if ($5 <= 12) print;}' > trfMask.bed
    # Soft-mask the filtered TRF regions first, then add the RepeatMasker
    # regions on top of that.
    maskOutFa -soft downloads/apiMel3.fa trfMask.bed tmp
    maskOutFa -softAdd tmp rmsk/rmsk.out apiMel3.masked.fa
    faToTwoBit apiMel3.masked.fa apiMel3.2bit
    # Create chromInfo and fill it with name, size and the /gbdb 2bit path
    # for every sequence in the 2bit file.
    hgsql apiMel3 < ~/kent/src/hg/lib/chromInfo.sql
    twoBitInfo apiMel3.2bit /dev/stdout |
        awk '{printf("%s\t%s\t/gbdb/apiMel3/apiMel3.2bit\n", $1, $2)}' > chrom.sizes
    echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql apiMel3
    # One masked FASTA per sequence, and a soft-masked nib built from each.
    mkdir -p maskedFa nib
    faSplit byname apiMel3.masked.fa maskedFa/
    cd maskedFa/
    for f in *; do
        g=${f%.fa}
        faToNib -softMask "$f" "../nib/${g}.nib"
    done
    cd ../
    rm apiMel3.masked.fa tmp trfMask.bed


#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 10/5/07 angie)
    ssh kkstore04
    cd /cluster/data/apiMel3
    # Put files in the usual chrom-based assembly locations:
    # TRF records with column 5 <= 12, split into per-sequence .bed files.
    zcat bed/simpleRepeat/trf.bed.gz \
    | awk '{if ($5 <= 12) print;}' \
    | splitFileByColumn stdin -ending=.bed bed/simpleRepeat/trfMaskChrom
    splitFileByColumn -chromDirs downloads/apiMel3.agp .
    # Split rmsk.out by column 5, giving every piece the 3-line header.
    head -3 rmsk/rmsk.out > /tmp/rmskHeader
    # "tail -n +4" is the POSIX spelling of the obsolete "tail +4".
    tail -n +4 rmsk/rmsk.out \
    | sort -k5,5 \
    | splitFileByColumn -chromDirs -head=/tmp/rmskHeader -col=5 \
      -ending=.fa.out stdin .
    # -p so that a re-run does not stop on an existing directory.
    mkdir -p jkStuff

    # Run in the background and follow the log as it is written.
    makeDownloads.pl apiMel3 -verbose=2 \
      >& jkStuff/downloads.log & tail -f jkStuff/downloads.log
    # Edited the README's -- instructions at the end of downloads.log.
#     /cluster/data/apiMel3/goldenPath/database/README.txt
#     /cluster/data/apiMel3/goldenPath/bigZips/README.txt
#     /cluster/data/apiMel3/goldenPath/chromosomes/README.txt


#########################################################################
