#!/bin/csh
exit;

set ortho=ortho4
set mercatorFile=encodeMaps20050504.tar.gz

# start with Mercator data from Colin Dewey, and copy it to working directory: 
#    /cluster/data/encode/$ortho/mercatorInput/${mercatorFile}

# ${HOME}/kent/src/hg/lib/encodeRegionMercator.sql
#create table encodeRegionsMercator (
#    bin        smallint(5)  unsigned not null,
#    chrom      varchar(255)          not null,
#    chromStart int(10)      unsigned not null,
#    chromEnd   int(10)      unsigned not null,
#    name       varchar(255)          not null,
#    score      int(10)      unsigned not null,
#    strand     char(1)               not null
#)

###
::::::::::::::
mercatorInput/makeBed5.pl
::::::::::::::
#!/usr/bin/perl -W

while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN\D\d\d\d)\.(\d+)_of.*/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###

set dir = /cluster/data/encode/${ortho}/mercatorInput
mkdir -p $dir
cd $dir

tar xvfz $mercatorFile
mv macMul1.bed rheMac1.bed
chmod g+w $dir/*
foreach db (`ls -1 *bed | sed 's/.bed//g' | xargs echo`)
    echo ========= $db ========== `date`
    rm -f $db.log $db.unmerged.bed5
    makeBed5.pl < $db.bed > $db.unmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 $db.unmerged.bed5 $db.bed5 >& $db.log
    hgLoadBed -tab -sqlTable=${HOME}/kent/src/hg/lib/encodeRegionsMercator.sql $db encodeRegionsMercator $db.bed >>& $db.log
    hgLoadBed -tab $db encodeRegionsMercatorMerged $db.bed5 >>& $db.log
end
rm -f bed.tab

########################################################################
#############  liftOver orthologous regions  ###########################
########################################################################
set dir = /cluster/data/encode/${ortho}/liftOverRegions
mkdir -p $dir
cd $dir
hgsql hg17 -e "SELECT * FROM encodeRegions ORDER BY name" | tail +2 > hg17.bed
hgsql -h genome-testdb hgcentraltest -e "select toDb, path from liftOverChain " | grep /gbdb/hg17 | grep -v hg16 > hg17.over.chain.files

########################################################################
# CHIMP
    # THESE NOTES ARE FROM ORTHO2
    # used chains from reciprocal best net -- these were more
    # chopped up, but regions in these don't span unbridged gaps
    # minMatch .7 leaves ENr324 and ENm011 unmapped
    # Lowered match threshold until region coverage looked good (to .4)
    # (except m11, for some reason),  with minimum of fragmentation
    # Same at .3 and .2 -- at the point where regions look reasonable,
    # lowering the threshold in this way doesn't change them.
    # NOTE: reducing minSizeQ to 10000, due to fragmented panTro1 assembly
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=10000 -multiple -chainTable=hg17.chainPanTro1 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain panTro1.unmerged.bed5 panTro1.unmapped.bed -verbose=2 >& panTro1.log
liftOverMerge -mergeGap=20000 -verbose=2 panTro1.unmerged.bed5 panTro1.bed5 >>& panTro1.log
wc -l panTro1.*bed*
    #     81 panTro1.bed5
    #      0 panTro1.unmapped.bed
    #    107 panTro1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' panTro1.bed5 > panTro1.bed
hgLoadBed -noBin panTro1 encodeRegionsLiftOver panTro1.bed >>& panTro1.log

########################################################################
# MACAQUE
    # NOTE: reducing minSizeQ to 10000, due to fragmented rheMac1 assembly
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=10000 -multiple -chainTable=hg17.chainRheMac1 hg17.bed /cluster/data/hg17/bed/liftOver/hg17.rheMac1.over.chain rheMac1.unmerged.bed5 rheMac1.unmapped.bed -verbose=2 >& rheMac1.log
liftOverMerge -mergeGap=20000 -verbose=2 rheMac1.unmerged.bed5 rheMac1.bed5 >>& rheMac1.log
wc -l rheMac1.*bed*
    #    495 rheMac1.bed5
    #      0 rheMac1.unmapped.bed
    #    544 rheMac1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' rheMac1.bed5 > rheMac1.bed
hgLoadBed -noBin rheMac1 encodeRegionsLiftOver rheMac1.bed >>& rheMac1.log

########################################################################
# RAT
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple -chainTable=hg17.chainRn3 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain rn3.unmerged.bed5 rn3.unmapped.bed -verbose=2 >& rn3.log
liftOverMerge -mergeGap=20000 -verbose=2 rn3.unmerged.bed5 rn3.bed5 >>& rn3.log
wc -l rn3.*bed*
    #     53 rn3.bed5
    #      0 rn3.unmapped.bed
    #     65 rn3.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' rn3.bed5 > rn3.bed
hgLoadBed -noBin rn3 encodeRegionsLiftOver rn3.bed >>& rn3.log

########################################################################
# CHICKEN
    # allow smaller chain sizes in query (only 1K, vs. 20K in close species)
liftOver -minMatch=.01 minSizeT=4000 -multiple -minSizeQ=1000 -chainTable=hg17.chainGalGal2 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain galGal2.unmerged.bed5 galGal2.unmapped.bed -verbose=2 >& galGal2.log
liftOverMerge -mergeGap=20000 -verbose=2 galGal2.unmerged.bed5 galGal2.bed5 >>& galGal2.log
wc -l galGal2.*bed*
    #     94 galGal2.bed5
    #      4 galGal2.unmapped.bed
    #    131 galGal2.unmerged.bed5
    ##Partially deleted in new
    #chr13   29418015        29918015        ENr111
    #chr5    141880150       142380150       ENr212
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' galGal2.bed5 > galGal2.bed
hgLoadBed -noBin galGal2 encodeRegionsLiftOver galGal2.bed >>& galGal2.log

########################################################################
# MOUSE MM6
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple hg17.bed /cluster/data/hg17/bed/liftOver/hg17.mm6.over.chain.gz mm6.unmerged.bed5 mm6.unmapped.bed -verbose=2 >& mm6.log
liftOverMerge -mergeGap=20000 -verbose=2 mm6.unmerged.bed5 mm6.bed5 >>& mm6.log
wc -l mm6.*bed*
    #     53 mm6.bed5
    #      0 mm6.unmapped.bed
    #     67 mm6.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' mm6.bed5 > mm6.bed
hgLoadBed -noBin mm6 encodeRegionsLiftOver mm6.bed >>& mm6.log

########################################################################
# DOG
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple -chainTable=hg17.chainCanFam1 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain canFam1.unmerged.bed5 canFam1.unmapped.bed -verbose=2 >& canFam1.log
liftOverMerge -mergeGap=20000 -verbose=2 canFam1.unmerged.bed5 canFam1.bed5 >>& canFam1.log
wc -l canFam1.*bed*
    #     51 canFam1.bed5
    #      0 canFam1.unmapped.bed
    #     58 canFam1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' canFam1.bed5 > canFam1.bed
hgLoadBed -noBin canFam1 encodeRegionsLiftOver canFam1.bed >>& canFam1.log

########################################################################
# OPOSSUM
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=5000 -multiple -chainTable=hg17.chainMonDom1 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain monDom1.unmerged.bed5 monDom1.unmapped.bed -verbose=2 >& monDom1.log
liftOverMerge -mergeGap=20000 -verbose=2 monDom1.unmerged.bed5 monDom1.bed5 >>& monDom1.log
wc -l monDom1.*bed*
    #    143 monDom1.bed5
    #      0 monDom1.unmapped.bed
    #    191 monDom1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' monDom1.bed5 > monDom1.bed
hgLoadBed -noBin monDom1 encodeRegionsLiftOver monDom1.bed >>& monDom1.log

########################################################################
# TETRA
rm -f tetNig1.log tetNig1.*bed*
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=1000 -multiple -chainTable=hg17.chainTetNig1 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain tetNig1.unmerged.bed5 tetNig1.unmapped.bed -verbose=2 >& tetNig1.log
liftOverMerge -mergeGap=20000 -verbose=2 tetNig1.unmerged.bed5 tetNig1.bed5 >>& tetNig1.log
wc -l tetNig1.*bed*
    #    137 tetNig1.bed5
    #     14 tetNig1.unmapped.bed
    #    167 tetNig1.unmerged.bed5
    #
    #Partially deleted in new
    #chr5    141880150       142380150       ENr212
    #
    #Deleted in new
    #chr2    51570355        52070355        ENr112
    #chr4    118604258       119104258       ENr113
    #chr16   25780427        26280428        ENr211
    #chr14   52947075        53447075        ENr311
    #chr11   130604797       131104797       ENr312
    #chr16   60833949        61333949        ENr313
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' tetNig1.bed5 > tetNig1.bed
hgLoadBed -noBin tetNig1 encodeRegionsLiftOver tetNig1.bed >>& tetNig1.log

########################################################################
# ZEBRAFISH
rm -f danRer2.log danRer2.*bed*
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=1000 -multiple -chainTable=hg17.chainDanRer2 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain danRer2.unmerged.bed5 danRer2.unmapped.bed -verbose=2 >& danRer2.log
liftOverMerge -mergeGap=20000 -verbose=2 danRer2.unmerged.bed5 danRer2.bed5 >>& danRer2.log
wc -l danRer2.*bed*
    #    238 danRer2.bed5
    #      2 danRer2.unmapped.bed
    #    278 danRer2.unmerged.bed5
    #Deleted in new
    #chr11   130604797       131104797       ENr312
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' danRer2.bed5 > danRer2.bed
hgLoadBed -noBin danRer2 encodeRegionsLiftOver danRer2.bed >>& danRer2.log

########################################################################
# COW
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple -chainTable=hg17.chainBosTau1 hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain bosTau1.unmerged.bed5 bosTau1.unmapped.bed -verbose=2 >& bosTau1.log
liftOverMerge -mergeGap=20000 -verbose=2 bosTau1.unmerged.bed5 bosTau1.bed5 >>& bosTau1.log
wc -l bosTau1.*bed*
    #    241 bosTau1.bed5
    #      2 bosTau1.unmapped.bed
    #    243 bosTau1.unmerged.bed5
    #Deleted in new
    #chr13   112338064       112838064       ENr132
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' bosTau1.bed5 > bosTau1.bed
hgLoadBed -noBin bosTau1 encodeRegionsLiftOver bosTau1.bed >>& bosTau1.log

rm -f bed.tab


########################################################################
#############       Create consensus regions     #######################
#############  (Union of liftOver and Mercator)  #######################
########################################################################

###
::::::::::::::
consensus/makeBed5.pl
::::::::::::::
#!/usr/bin/perl -W
#assumes that input is sorted by name (region)
$region="";
$regionCounter=0;

while (<>) 
{
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d)/)
    {
        if ($region eq $4) { $regionCounter++; } else { $region=$4; $regionCounter=1; }
        printf("$1\t$2\t$3\t$4\t$regionCounter\n");
    }
    elsif (/^(.*)\s+(\d+)\s+(\d+)\s+(MEN[mr]\d\d\d)/)
    {
        # regions with names that begin with 'M' have been manually marked for
        # removal and can be ignored.
        next;
    }
    elsif (/chromStart/)
    {
        # header line
        next;
    }
    else
    {
        die "Bad input line: $_";
    }
}
###

# create consensus regions and load them
set dir = /cluster/data/encode/${ortho}/consensus
mkdir -p $dir
cd $dir

date
foreach db (`ls -1 /cluster/data/encode/${ortho}/mercatorInput/*bed | sed 's/.bed//g' | cut -f7 -d"/" | grep -v '\.' | xargs echo`)
    rm -f $db.log $db.*.bed* $db.encodeRegionsConsensus*
    regionOrtho $db.encodeRegionsLiftOver $db.encodeRegionsMercatorMerged $db.encodeRegionsConsensusUnmerged.bed $db.regionOrtho.err >& $db.log
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensusUnmerged.bed | makeBed5.pl > $db.encodeRegionsConsensusUnmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 $db.encodeRegionsConsensusUnmerged.bed5 $db.encodeRegionsConsensus1.bed5 >>& $db.log
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' $db.encodeRegionsConsensus1.bed5 > $db.encodeRegionsConsensus1.bed
    hgLoadBed $db encodeRegionsConsensus -tab $db.encodeRegionsConsensus1.bed >>& $db.log
    hgsql $db -e "select * from encodeRegionsConsensus where chromEnd-chromStart<2000" > $db.smallRegions.bed
    hgsql $db -e "delete from encodeRegionsConsensus where chromEnd-chromStart<2000"
    hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" > $db.encodeRegionsConsensus2.bed
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensus2.bed | makeBed5.pl > $db.encodeRegionsConsensus.bed5
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' $db.encodeRegionsConsensus.bed5 > $db.encodeRegionsConsensus.bed
    hgLoadBed $db encodeRegionsConsensus -tab $db.encodeRegionsConsensus.bed >>& $db.log
end
date


########################################################################
#############  Make html frames pages for analysis  ####################
########################################################################

set dir = /cluster/data/encode/${ortho}/html
mkdir -p $dir
cd $dir

rm -f positionFile
hgsql -e "select chrom, chromStart+1, chromEnd, name from encodeRegions order by name" hg17 | tail +2 | awk '{printf "%s:%d-%d\t%s\n",$1,$2,$3,$4}' > positionFile

cp -f ${HOME}/kent/src/hg/encode/regionOrtho/mkOrthologAllFrame.pl /cluster/bin/scripts/mkOrthologAllFrame.pl
chmod ug+x /cluster/bin/scripts/mkOrthologAllFrame.pl

ln -s ../../ortho3/html/descriptionFile .
ln -s ../../ortho3/html/headerFile .

# 20-30 seconds each - lots of nib lookups
foreach db (`ls -1 /cluster/data/encode/${ortho}/mercatorInput/*bed | sed 's/.bed//g' | cut -f7 -d"/" | grep -v '\.' | xargs echo`)
    echo `date` $db
    rm -f $db.html
    mkOrthologAllFrame.pl descriptionFile positionFile headerFile hg17 $db encodeRegionsConsensus encodeRegionsLiftOver encodeRegionsMercator > $db.html
end
#Wed May 4 14:19:44 PDT 2005 bosTau1
#Wed May 4 14:30:57 PDT 2005 canFam1
#Wed May 4 14:31:17 PDT 2005 danRer2
#Wed May 4 14:31:33 PDT 2005 fr1
#Wed May 4 14:31:39 PDT 2005 galGal2
#Wed May 4 14:31:50 PDT 2005 mm6
#Wed May 4 14:32:14 PDT 2005 monDom1
#Wed May 4 14:32:40 PDT 2005 panTro1
#Wed May 4 14:33:03 PDT 2005 rheMac1
#Wed May 4 14:38:44 PDT 2005 rn3
#Wed May 4 14:39:07 PDT 2005 tetNig1
#Wed May 4 14:39:18 PDT 2005 xenTro1
#Wed May 4 14:39:35 PDT 2005

set outDir = /usr/local/apache/htdocs/encode/${ortho}
mkdir -p $outDir
cp -f *.html $outDir
chmod o+r $outDir/*.html
chmod o+rx $outDir

## prune excessively large mercator regions

set dir = /cluster/data/encode/${ortho}/mercatorPrune
mkdir -p $dir
cd $dir
rm -f removeBigRegions.sql

grep -P "\d" ../html/*encodeRegionsMercator.bed | awk '{printf "%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$3-$2}' | removeBigRegions.pl > removeBigRegions.sql
hgsql hg17 < removeBigRegions.sql

## recompute consensus, and remake html files by copying and executing the code from those sections above
#Wed May 4 21:15:38 PDT 2005 bosTau1
#Wed May 4 21:25:57 PDT 2005 canFam1
#Wed May 4 21:26:15 PDT 2005 danRer2
#Wed May 4 21:26:29 PDT 2005 fr1
#Wed May 4 21:26:34 PDT 2005 galGal2
#Wed May 4 21:26:45 PDT 2005 mm6
#Wed May 4 21:27:06 PDT 2005 monDom1
#Wed May 4 21:27:29 PDT 2005 panTro1
#Wed May 4 21:27:51 PDT 2005 rheMac1
#Wed May 4 21:33:04 PDT 2005 rn3
#Wed May 4 21:33:25 PDT 2005 tetNig1
#Wed May 4 21:33:35 PDT 2005 xenTro1


########################################################################
###############  Write AGP files from consensus regions  ###############
########################################################################

# TODO: contigAcc files should go in $outDir (neater this way)

#ortho is set previously and is a reminder here
#set ortho=ortho4 
set orthoDir     = /cluster/data/encode/${ortho}
set agpDir       = $orthoDir/agp
set consensusDir = $orthoDir/consensus
set downloadDir  = /usr/local/apache/htdocs/encode/${ortho}/agp
set regionAgp    = regionAgp
mkdir -p $agpDir $downloadDir $agpDir/tests
chmod o+rx $downloadDir
rm -rf $agpDir/* $downloadDir/*
cd $agpDir
#rm -f *.contig.tab */*.packing.list >& /dev/null

###
::::::::::::::
agp/makeBed5.pl
::::::::::::::
#!/usr/bin/perl -W
#assumes that input is already numbered
while (<>) {
    chomp();
    if (/chromStart/) {
	next;
    }
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d).(\d+)/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###
chmod ug+x makeBed5.pl

########################################################################
# create script to run AGP-maker on an assembly
rm -rf genomeAgps.csh >& /dev/null
cat > genomeAgps.csh << 'EOF'
#!/bin/csh

# create AGPs for a genome assembly
# NOTE: should have freeze dir (${ortho}) as an arg
# WARNING: this script uses $consensusDir/*.bed5, but instructions
# below create a separate version, from the encodeRegionsConsensus table
# Suggest next time using just the $consensusDir everywhere

if ($#argv != 4) then
    echo "usage: $0 <db> <org> <outdir> <ortho>"
    exit 1
endif
echo
echo "command line:" $0 $1 $2 $3 $4
echo "working dir:"  `pwd`

set db           = $1
set org          = $2
set outDir       = $3
set ortho        = $4
set consensusDir = /cluster/data/encode/${ortho}/consensus
set buildDir     = /cluster/data/$db
set bedFile      = $consensusDir/$db.encodeRegionsConsensus.bed5
set regionAgp    = regionAgp
mkdir -p $outDir

if (-f $org.contig.tab) then
    set contigArg = "-contigFile=$org.contig.tab"
else
    set contigArg = ""
endif

set runRegionAgp = "$regionAgp $contigArg -namePrefix=${org}_ $bedFile stdin -dir $outDir"

#if (-f $buildDir/chrom.lst) then
#    cat $buildDir/chrom.lst | xargs -iX cat $buildDir/X/chr{X,X_random}.agp | $runRegionAgp
#else
    cat $buildDir/?{,?}/*.agp | $runRegionAgp
#endif
'EOF'
# << this line makes emacs coloring happy

########################################################################
# create test scripts

cd /cluster/data/encode/${ortho}/agp/tests
cat > getSeqFromAcc.pl <<EOF
#!/usr/bin/perl -W
# download contig sequences from Genbank to $db.contig.fa directory
$URL = "http://www.ncbi.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&id=";
$db=shift;
$ortho=shift;
$dir = "/cluster/data/encode/${ortho}/agp/tests/$db.contig.fa";
system("mkdir -p $dir");
while (<>)
{
    chomp();
    if (-e "$dir/$_") { next; }
    system("wget -O $dir/$_ \"$URL$_\"");
}
EOF
#
chmod ug+x getSeqFromAcc.pl

cd /cluster/data/encode/${ortho}/agp/tests
cat > testFa.csh << 'EOF'
#!/bin/csh
set db = $1                                                                                 
set org = $2                                                                                
set ortho = $3
ln -s /cluster/data/encode/${ortho}/consensus/$db.encodeRegionsConsensus.bed /cluster/data/encode/${ortho}/consensus/$db.bed
foreach f (../$db/*.agp)                                                                    
    set seq = $f:t:r                                                                        
    set region = `echo $seq | sed "s/${org}_//"`                                            
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/${ortho}/consensus/$db.bed`
    set chr = $coords:r:r                                                                   
    nibFrag /cluster/data/$db/nib/${chr}.nib $coords:r:e $coords:e + $db.nibTest/$db.$region.fa                                  
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa                             
end            
'EOF'
# << for emacs

########################################################################

# MOUSE AGPs

cd /cluster/data/encode/${ortho}/agp
#set db  = mm5
set db  = mm6
set org = mouse

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus order by name" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 57

# Create AGPs
genomeAgps.csh $db $org $db $ortho

# Test 
cd tests
rm -fr $db.test $db.nibTest 
mkdir $db.test $db.nibTest
# note: tab required in grep (retype when pasting)  ***** READ THIS ************
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
# download contigs -- not needed if already run
# NOTE: preferable to extract these from build dir
getSeqFromAcc.pl $db $ortho < $db.accessions >& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa 
faSplit byname $db.fa $db.test/
testFa.csh $db $org ${ortho} >&! testFa.$org.log &
head testFa.$org.log
# verify that all regions are the same
grep -v same testFa.$org.log
cd ..

# Create packing list
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList
$encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org "Mus musculus" 10090  C57BL/6J MAY-2004 $db "NCBI Build 33" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 57

# Copy  to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/$db.*
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 57

######################################################################a
# RAT AGPs

cd /cluster/data/encode/${ortho}/agp
set db  = rn3
set org = rat
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

#create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus order by name" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5

# Get contig to accession map
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db $ortho

# Test
cd tests
mkdir -p $db.test $db.nibTest
# note: tab required in grep (retype when pasting)  *********************************
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
# download contig sequences to $db.contig.fa directory
# note: this doesn't handle errors well -- better to extract from 
#  build dir contig fasta files
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log
# verify all contigs were transferred
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
#./accHeader.pl $db.contig.fa $db >>& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ${ortho} >&! testFa.$org.log
grep -v same testFa.$org.log
cd ..

# Create packing list
$encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org "Rattus norvegicus" 10116 BN/SsNHsdMCW JUN-2003 $db "Baylor HGSC v3.1" > $db/$db.packing.list
 
# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/*

hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l


########################################################################
# CHICKEN AGPs

cd /cluster/data/encode/${ortho}/agp
set db  = galGal2
set org = chicken

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 92

# Get contig to accession mapping (documented in makeGalGal2.doc) 
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
# errors here: missing /cluster/data/galGal2/chr*_random.agp and chrM.agp files
# but there are no ENCODE orthologous sequences on these chicken chromosomes
csh genomeAgps.csh $db $org $db

# Test 
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions 
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ${ortho} >&! testFa.$org.log &
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Gallus gallus" 9031 N/A FEB-2004 $db "CGSC Feb. 2004" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 92

# Copy  to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 92 

########################################################################
#  DOG AGPs
set db  = canFam1
set org = dog

cd /cluster/data/encode/${ortho}/agp
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db
# ignore error messages for random chrom's and chrM (not in dog assembly)

# Tests
cd tests
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 \
    $db $org "Canis Familiaris" 9615 N/A JUL-2004 $db \
    "Broad Institute v. 1.0" > $db/$db.packing.list

# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp

hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 52

########################################################################
# CHIMP AGPs
set db  = panTro1
set org = chimp

cd /cluster/data/encode/${ortho}/agp
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
# NOTE: next time, put chimpChromContigs.agp into $outDir
cat /cluster/data/$db/?{,?}/*.agp | \
   chimpChromContigAgp stdin /cluster/data/$db/assembly.agp chimpChromContigs.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
    $db/$db.encodeRegionsConsensus.bed5 chimpChromContigs.agp -dir $db

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 \
    $db  $org "Pan troglodytes" 9598 N/A NOV-2003 $db \
    "NCBI Build 1 v1" > $db/$db.packing.list

# Test
cd tests
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp

set downloadDir  = /usr/local/apache/htdocs/encode/${ortho}/agp
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 81

########################################################################
# OPOSSUM AGPs
set db  = monDom1
set org = opossum

cd /cluster/data/encode/${ortho}/agp

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 146

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
gunzip -c /cluster/data/monDom1/broad.mit.edu/assembly.agp.gz > $db.assembly.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
        $db/$db.encodeRegionsConsensus.bed5 $db.assembly.agp -dir $db
rm $db.assembly.agp

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/

cat > testScaffoldFa.csh << 'EOF'
set db = $1
set org = $2
set freeze = $3
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/$freeze/html/$db.encodeRegionsConsensus.bed`
    set chr = $coords:r:r
    twoBitToFa /cluster/data/$db/monDom1.2bit \
        -seq=$chr -start=$coords:r:e -end=$coords:e $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << make emacs happy

csh testScaffoldFa.csh  $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Monodelphis domestica" 13616 N/A \
    OCT-2004 $db "Broad Institute" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 146

# Copy  to downloads area
set downloadDir  = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
#  146


########################################################################
# TETRA AGPs
set db  = tetNig1
set org = tetra

cd /cluster/data/encode/${ortho}/agp

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 185

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Tetraodon nigroviridis" 99883 N/A \
    FEB-2004 $db "Genoscope V7" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 185

# Copy  to downloads area
set downloadDir  = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l

########################################################################
# ZEBRAFISH AGPs
set db  = danRer2
set org = zebrafish

cd /cluster/data/encode/${ortho}/agp

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 247

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Danio rerio" 7955 N/A \
    JUN-2004 $db "Sanger Zv4" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 247

# Copy  to downloads area
set downloadDir  = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 247


########################################################################
# cleanup

chmod -R o+r $downloadDir/* >& /dev/null
exit

########################################################################

# print summary stats

cat > stats.csh << 'EOF'
hgsql hg16 -N -e "select name from encodeRegions" | sort > allRegions
foreach f ($1/*.list)
    set db = $f:t:r:r
    set agps = `grep AgpFile $f | wc -l`
    set len = `awk '/SeqLength/ {len += $2} END {print len}' $f`
    set missing = `grep AgpFile $f | sed 's/.*\(EN.*\)_.*/\1/' | sort|uniq | diff - allRegions`
    printf "%-8s %5s %10s %s\n" $db $agps $len $missing
end 
rm allRegions
'EOF'

csh stats.csh $downloadDir

