
2011-10-19: import of GENCODE V9 (markd)
    # Due to the small number of changes from V7, the GENCODE group decided
    # to not push V9 to the public.  It is being loaded so that it goes
    # to the preview-browser

    # Due to UCSC Genome Browser using the NC_001807 mitochondrial genome sequence
    # (chrM) and GENCODE annotating the NC_012920 mitochondrial sequence, the
    # GENCODE mitochondrial sequences are not loaded

    # download files
    mkdir -p /hive/groups/encode/dcc/data/gencodeV9/release
    cd /hive/groups/encode/dcc/data/gencodeV9/release

    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.2wayconspseudos.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.annotation.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.long_noncoding_RNAs.fa.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.long_noncoding_RNAs.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.pc_transcripts.fa.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.pc_translations.fa.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.polyAs.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode.v9.tRNAs.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_9/gencode9_GRCh37.tgz

    # silly sanity check:
    for f in * ; do zcat $f >/dev/null ; done

    # untar main distribution
    tar -zxf gencode9_GRCh37.tgz

    # 2011-10-24: problems found with the new file
    #   gencode_release_9/metadata/gencode.v9.metadata.PolyA_feature
    # Since this is a new file for an experimental feature, an updated
    # version was obtained from Sanger and saved as
    #   release/metaDataPolyA_version9.txt.gz
    # (paths here are relative to /hive/groups/encode/dcc/data/gencodeV9/)
    zcat release/metaDataPolyA_version9.txt.gz >release/gencode_release_9/metadata/gencode.v9.metadata.fix.PolyA_feature

    # create Makefile from previous one
    cd /hive/groups/encode/dcc/data/gencodeV9/
    cp ../gencodeV7/Makefile .
    # edit as needed
   
    # the build depends on code in the CCDS subversion tree:
    #   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/svnroot/hausslerlab/ccds/trunk
    # and on markd's python library (it will be moved to the hausslerlab
    # repository soon)
    # may need to update
        ccds2/modules/gencode/src/lib/gencode/data/gencodeGenes.py
    # to add new biotypes; use this command to verify and update as needed
    make checkAttrs

    (time nice make) >&build.out&
    # took about 52 minutes wall-clock (48 minutes user CPU), log below

------------------------------------------------------------------------------
cat release/gencode_release_9/gencode.v9.annotation.level_1_2.gtf release/gencode_release_9/gencode.v9.annotation.level_3.gtf | gencodeGtfToGenePred /dev/stdin data/gencodeManAuto.gp.hgwdev.31753.tmp
mv -f data/gencodeManAuto.gp.hgwdev.31753.tmp data/gencodeManAuto.gp
cat release/gencode_release_9/gencode.v9.annotation.level_1_2.gtf release/gencode_release_9/gencode.v9.annotation.level_3.gtf | gencodeGtfToAttrs /dev/stdin data/gencodeManAuto.tsv.hgwdev.31753.tmp
mv -f data/gencodeManAuto.tsv.hgwdev.31753.tmp data/gencodeManAuto.tsv
gencodeMakeTracks --excludeChrom=chrM $(echo Basic | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeBasicV9.gp.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeBasicV9.gp.hgwdev.31753.tmp tables/wgEncodeGencodeBasicV9.gp
gencodeMakeTracks --excludeChrom=chrM $(echo Comp | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeCompV9.gp.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeCompV9.gp.hgwdev.31753.tmp tables/wgEncodeGencodeCompV9.gp
gencodeMakeTracks --excludeChrom=chrM $(echo PseudoGene | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodePseudoGeneV9.gp.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodePseudoGeneV9.gp.hgwdev.31753.tmp tables/wgEncodeGencodePseudoGeneV9.gp
gencodePolyaGtfToGenePred release/gencode_release_9/gencode.v9.polyAs.gtf tables/wgEncodeGencodePolyaV9.gp.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodePolyaV9.gp.hgwdev.31753.tmp tables/wgEncodeGencodePolyaV9.gp
tawk '$3=="transcript"{$3 = "exon"} {print $0}' release/gencode_release_9/gencode.v9.2wayconspseudos.GRCh37.gtf | gtfToGenePred stdin tables/wgEncodeGencode2wayConsPseudoV9.gp.hgwdev.31753.tmp
mv -f tables/wgEncodeGencode2wayConsPseudoV9.gp.hgwdev.31753.tmp tables/wgEncodeGencode2wayConsPseudoV9.gp
gencodeMakeAttrs --excludeChrom=chrM data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeAttrsV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeTagV9.tab
mv -f tables/wgEncodeGencodeAttrsV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeAttrsV9.tab
mkdir -p tables/
cp release/gencode_release_9/metadata/gencode.v9.metadata.Gene_source tables/wgEncodeGencodeGeneSourceV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeGeneSourceV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeGeneSourceV9.tab
mkdir -p tables/
cp release/gencode_release_9/metadata/gencode.v9.metadata.Transcript_source tables/wgEncodeGencodeTranscriptSourceV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeTranscriptSourceV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeTranscriptSourceV9.tab
mkdir -p tables/
cp release/gencode_release_9/metadata/gencode.v9.metadata.Transcript_supporting_feature tables/wgEncodeGencodeTranscriptSupportV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeTranscriptSupportV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeTranscriptSupportV9.tab
tawk '{split($5,coord,":|-"); print $1,$2,$3,$4,coord[1],coord[2]-1,coord[3]}' release/gencode_release_9/metadata/gencode.v9.metadata.Exon_supporting_feature | sort -k 1,1 -k 2,2 -k 5,5 -k 6,6n > tables/wgEncodeGencodeExonSupportV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeExonSupportV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeExonSupportV9.tab
mkdir -p tables/
cp release/gencode_release_9/metadata/gencode.v9.metadata.PDB tables/wgEncodeGencodePdbV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodePdbV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodePdbV9.tab
mkdir -p tables/
cp release/gencode_release_9/metadata/gencode.v9.metadata.Pubmed_id tables/wgEncodeGencodePubMedV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodePubMedV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodePubMedV9.tab
mkdir -p tables/
cp release/gencode_release_9/metadata/gencode.v9.metadata.RefSeq tables/wgEncodeGencodeRefSeqV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeRefSeqV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeRefSeqV9.tab
(tawk '{print $0,"SwissProt"}' release/gencode_release_9/metadata/gencode.v9.metadata.SwissProt && tawk '{print $0,"TrEMBL"}' release/gencode_release_9/metadata/gencode.v9.metadata.TrEMBL) | sort -k 1,1 > tables/wgEncodeGencodeUniProtV9.tab.hgwdev.31753.tmp
mv -f tables/wgEncodeGencodeUniProtV9.tab.hgwdev.31753.tmp tables/wgEncodeGencodeUniProtV9.tab
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodeBasicV9 tables/wgEncodeGencodeBasicV9.gp
touch loaded/wgEncodeGencodeBasicV9.genePredExt.loaded
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodeCompV9 tables/wgEncodeGencodeCompV9.gp
touch loaded/wgEncodeGencodeCompV9.genePredExt.loaded
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodePseudoGeneV9 tables/wgEncodeGencodePseudoGeneV9.gp
touch loaded/wgEncodeGencodePseudoGeneV9.genePredExt.loaded
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodePolyaV9 tables/wgEncodeGencodePolyaV9.gp
touch loaded/wgEncodeGencodePolyaV9.genePredExt.loaded
flock load.lock hgLoadGenePred hg19 wgEncodeGencode2wayConsPseudoV9 tables/wgEncodeGencode2wayConsPseudoV9.gp
touch loaded/wgEncodeGencode2wayConsPseudoV9.genePred.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeAttrsV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeAttrs.sql tables/wgEncodeGencodeAttrsV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeAttrsV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTagV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTag.sql tables/wgEncodeGencodeTagV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTagV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeGeneSourceV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeGeneSource.sql tables/wgEncodeGencodeGeneSourceV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeGeneSourceV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSourceV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSource.sql tables/wgEncodeGencodeTranscriptSourceV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTranscriptSourceV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSupportV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSupport.sql tables/wgEncodeGencodeTranscriptSupportV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTranscriptSupportV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeExonSupportV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeExonSupport.sql tables/wgEncodeGencodeExonSupportV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeExonSupportV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodePdbV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePdb.sql tables/wgEncodeGencodePdbV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodePdbV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodePubMedV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePubMed.sql tables/wgEncodeGencodePubMedV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodePubMedV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeRefSeqV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeRefSeq.sql tables/wgEncodeGencodeRefSeqV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeRefSeqV9.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeUniProtV9 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeUniProt.sql tables/wgEncodeGencodeUniProtV9.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeUniProtV9.tab.loaded
mkdir -p check/
hgsql -Ne 'select geneId from wgEncodeGencodeAttrsV9 where geneId not in (select geneId from wgEncodeGencodeGeneSourceV9)' hg19 | sort -u >check/wgEncodeGencodeGeneSourceV9.missing
touch check/wgEncodeGencodeGeneSourceV9.checked
mkdir -p check/
hgsql -Ne 'select transcriptId from wgEncodeGencodeAttrsV9 where transcriptId not in (select transcriptId from wgEncodeGencodeTranscriptSourceV9)' hg19 | sort -u >check/wgEncodeGencodeTranscriptSourceV9.missing
touch check/wgEncodeGencodeTranscriptSourceV9.checked

real	51m36.286s
user	48m11.171s
sys	0m32.405s
==============================================================================

# update and load trackDb, then test searches with and without versions
#  Basic & comprehensive
    ENST00000399837.2      ENSG00000093072.9
    OTTHUMT00000316079.1   OTTHUMG00000030726.7

#  Comprehensive only

    ENST00000469063.1      ENSG00000093072.9
    OTTHUMT00000316084.1   OTTHUMG00000030726.7



#============================================================================
# To bring it in line with ENCODE track standards:
#
#cd /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19
#mkdir wgEncodeGencodeV9
#cd wgEncodeGencodeV9
#mkdir supplemental
#mkdir release1
#ln -s release1 beta
#ln -s release1 releaseLatest
#*** I (cricket) copied the files in
#/hive/groups/encode/dcc/data/gencodeV9/release to the supplemental dir.  Maybe this
#should be a hardlink ****
#
#table dump for downloads files?  Need to check with Brian
#
#Add metaData
#cd USER/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha
#cp wgEncodeGencodeV7.ra wgEncodeGencodeV9.ra
#vi wgEncodeGencodeV9.ra
#edit to have the V9 tables, new submission date, new subId, and drop the md5sums
#encodeExp find hg19 -composite=wgEncodeGencodeV9 out.ra
#encodeExp add out.ra
#rm out.ra
#mdbUpdate hg19 -composite=wgEncodeGencodeV9 -encodeExp= -accession
#mdbPrint hg19 -composite=wgEncodeGencodeV9 > wgEncodeGencodeV9
