
2011-12-28: import of GENCODE V10 (markd)
    # Because the UCSC Genome Browser uses the NC_001807 mitochondrial genome
    # sequence (chrM) while GENCODE annotates the NC_012920 mitochondrial
    # sequence, the GENCODE mitochondrial (chrM) annotations are not loaded.

    # download files
    mkdir -p /hive/groups/encode/dcc/data/gencodeV10/release
    cd /hive/groups/encode/dcc/data/gencodeV10/release

    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.2wayconspseudos.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.annotation.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.long_noncoding_RNAs.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.pc_transcripts.fa.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.pc_translations.fa.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.polyAs.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.tRNAs.gtf.gz
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode10_GRCh37.tgz

    # NOTE: this file was missing from the V10 release (it was present in V9);
    # ask Sanger about it.
    wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.long_noncoding_RNAs.fa.gz

    # silly sanity check:
    for f in * ; do zcat $f >/dev/null ; done

    # untar main distribution
    tar -zxf gencode10_GRCh37.tgz

    # create Makefile from previous one
    cd /hive/groups/encode/dcc/data/gencodeV10/
    cp ../gencodeV9/Makefile .
    # edit as needed
   
    # The build depends on code in the CCDS subversion tree:
    #   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/svnroot/hausslerlab/ccds/trunk
    # and on markd's python library (it will be moved to the hausslerlab
    # repository soon).
    # To add new biotypes, it may be necessary to update
        ccds2/modules/gencode/src/lib/gencode/data/gencodeGenes.py
    # Use this command to verify, and update as needed:
    make checkAttrs

    (time nice make -j 10) >&build.out&
    # took ~45 minutes, log below

    ### IMPORTANT: make sure that hgTracks/simpleTracks.c registers a
    ### track handler for this version of GENCODE:
    registerTrackHandlerOnFamily("wgEncodeGencodeV10", gencodeGeneMethods);

------------------------------------------------------------------------------
cat release/gencode_release_10/gencode.v10.annotation.level_1_2.gtf release/gencode_release_10/gencode.v10.annotation.level_3.gtf | gencodeGtfToGenePred /dev/stdin data/gencodeManAuto.gp.hgwdev.22170.tmp
cat release/gencode_release_10/gencode.v10.annotation.level_1_2.gtf release/gencode_release_10/gencode.v10.annotation.level_3.gtf | gencodeGtfToAttrs /dev/stdin data/gencodeManAuto.tsv.hgwdev.22170.tmp
mkdir -p tables/
mkdir -p tables/
mkdir -p tables/
gencodePolyaGtfToGenePred release/gencode_release_10/gencode.v10.polyAs.gtf tables/wgEncodeGencodePolyaV10.gp.hgwdev.22170.tmp
tawk '$3=="transcript"{$3 = "exon"} {print $0}' release/gencode_release_10/gencode.v10.2wayconspseudos.gtf | gtfToGenePred stdin tables/wgEncodeGencode2wayConsPseudoV10.gp.hgwdev.22170.tmp
cp release/gencode_release_10/metadata/gencode.v10.metadata.Gene_source tables/wgEncodeGencodeGeneSourceV10.tab.hgwdev.22170.tmp
cp release/gencode_release_10/metadata/gencode.v10.metadata.Transcript_source tables/wgEncodeGencodeTranscriptSourceV10.tab.hgwdev.22170.tmp
cp release/gencode_release_10/metadata/gencode.v10.metadata.Transcript_supporting_feature tables/wgEncodeGencodeTranscriptSupportV10.tab.hgwdev.22170.tmp
tawk '{split($5,coord,":|-"); print $1,$2,$3,$4,coord[1],coord[2]-1,coord[3]}' release/gencode_release_10/metadata/gencode.v10.metadata.Exon_supporting_feature | sort -k 1,1 -k 2,2 -k 5,5 -k 6,6n > tables/wgEncodeGencodeExonSupportV10.tab.hgwdev.22170.tmp
mkdir -p tables/
cp release/gencode_release_10/metadata/gencode.v10.metadata.PDB tables/wgEncodeGencodePdbV10.tab.hgwdev.22170.tmp
mkdir -p tables/
cp release/gencode_release_10/metadata/gencode.v10.metadata.Pubmed_id tables/wgEncodeGencodePubMedV10.tab.hgwdev.22170.tmp
mv -f tables/wgEncodeGencodePdbV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodePdbV10.tab
mkdir -p tables/
cp release/gencode_release_10/metadata/gencode.v10.metadata.RefSeq tables/wgEncodeGencodeRefSeqV10.tab.hgwdev.22170.tmp
mv -f tables/wgEncodeGencodeGeneSourceV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeGeneSourceV10.tab
(tawk '{print $0,"SwissProt"}' release/gencode_release_10/metadata/gencode.v10.metadata.SwissProt && tawk '{print $0,"TrEMBL"}' release/gencode_release_10/metadata/gencode.v10.metadata.TrEMBL) | sort -k 1,1 > tables/wgEncodeGencodeUniProtV10.tab.hgwdev.22170.tmp
mv -f tables/wgEncodeGencodeRefSeqV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeRefSeqV10.tab
tawk '{print $1,gensub("\\\\n|\\\\","","g",$2)}' release/gencode_release_10/metadata/gencode.v10.metadata.Annotation_remark | sort -k 1,1 > tables/wgEncodeGencodeAnnotationRemarkV10.tab.hgwdev.22170.tmp
mv -f tables/wgEncodeGencodeTranscriptSupportV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeTranscriptSupportV10.tab
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeGeneSourceV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeGeneSource.sql tables/wgEncodeGencodeGeneSourceV10.tab 
Scanning through 1 files
mv -f tables/wgEncodeGencode2wayConsPseudoV10.gp.hgwdev.22170.tmp tables/wgEncodeGencode2wayConsPseudoV10.gp
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSupportV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSupport.sql tables/wgEncodeGencodeTranscriptSupportV10.tab 
mv -f tables/wgEncodeGencodeTranscriptSourceV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeTranscriptSourceV10.tab
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodePdbV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePdb.sql tables/wgEncodeGencodePdbV10.tab 
touch loaded/wgEncodeGencodeGeneSourceV10.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeRefSeqV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeRefSeq.sql tables/wgEncodeGencodeRefSeqV10.tab 
Scanning through 1 files
mv -f tables/wgEncodeGencodePubMedV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodePubMedV10.tab
flock load.lock hgLoadGenePred hg19 wgEncodeGencode2wayConsPseudoV10 tables/wgEncodeGencode2wayConsPseudoV10.gp
touch loaded/wgEncodeGencodeTranscriptSupportV10.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSourceV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSource.sql tables/wgEncodeGencodeTranscriptSourceV10.tab 
Scanning through 1 files
mv -f tables/wgEncodeGencodeAnnotationRemarkV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeAnnotationRemarkV10.tab
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodePubMedV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePubMed.sql tables/wgEncodeGencodePubMedV10.tab 
touch loaded/wgEncodeGencodeRefSeqV10.tab.loaded
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeAnnotationRemarkV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeAnnotationRemark.sql tables/wgEncodeGencodeAnnotationRemarkV10.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTranscriptSourceV10.tab.loaded
Scanning through 1 files
touch loaded/wgEncodeGencodePubMedV10.tab.loaded
Scanning through 1 files
touch loaded/wgEncodeGencodeAnnotationRemarkV10.tab.loaded
Scanning through 1 files
mv -f tables/wgEncodeGencodeUniProtV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeUniProtV10.tab
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeUniProtV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeUniProt.sql tables/wgEncodeGencodeUniProtV10.tab 
touch loaded/wgEncodeGencodePdbV10.tab.loaded
touch loaded/wgEncodeGencode2wayConsPseudoV10.genePred.loaded
Scanning through 1 files
touch loaded/wgEncodeGencodeUniProtV10.tab.loaded
mv -f tables/wgEncodeGencodePolyaV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodePolyaV10.gp
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodePolyaV10 tables/wgEncodeGencodePolyaV10.gp
touch loaded/wgEncodeGencodePolyaV10.genePredExt.loaded
mv -f data/gencodeManAuto.gp.hgwdev.22170.tmp data/gencodeManAuto.gp
mv -f data/gencodeManAuto.tsv.hgwdev.22170.tmp data/gencodeManAuto.tsv
gencodeMakeTracks --excludeChrom=chrM $(echo Basic | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeBasicV10.gp.hgwdev.22170.tmp
gencodeMakeTracks --excludeChrom=chrM $(echo Comp | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeCompV10.gp.hgwdev.22170.tmp
gencodeMakeTracks --excludeChrom=chrM $(echo PseudoGene | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodePseudoGeneV10.gp.hgwdev.22170.tmp
gencodeMakeAttrs --excludeChrom=chrM data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeAttrsV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeTagV10.tab
mv -f tables/wgEncodeGencodePseudoGeneV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodePseudoGeneV10.gp
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodePseudoGeneV10 tables/wgEncodeGencodePseudoGeneV10.gp
touch loaded/wgEncodeGencodePseudoGeneV10.genePredExt.loaded
mv -f tables/wgEncodeGencodeCompV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodeCompV10.gp
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodeCompV10 tables/wgEncodeGencodeCompV10.gp
mv -f tables/wgEncodeGencodeBasicV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodeBasicV10.gp
flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodeBasicV10 tables/wgEncodeGencodeBasicV10.gp
mv -f tables/wgEncodeGencodeAttrsV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeAttrsV10.tab
mkdir -p loaded/
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeAttrsV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeAttrs.sql tables/wgEncodeGencodeAttrsV10.tab 
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTagV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTag.sql tables/wgEncodeGencodeTagV10.tab 
touch loaded/wgEncodeGencodeCompV10.genePredExt.loaded
touch loaded/wgEncodeGencodeBasicV10.genePredExt.loaded
Scanning through 1 files
touch loaded/wgEncodeGencodeAttrsV10.tab.loaded
mkdir -p check/
mkdir -p check/
hgsql -Ne 'select geneId from wgEncodeGencodeAttrsV10 where geneId not in (select geneId from wgEncodeGencodeGeneSourceV10)' hg19 | sort -u >check/wgEncodeGencodeGeneSourceV10.missing
hgsql -Ne 'select transcriptId from wgEncodeGencodeAttrsV10 where transcriptId not in (select transcriptId from wgEncodeGencodeTranscriptSourceV10)' hg19 | sort -u >check/wgEncodeGencodeTranscriptSourceV10.missing
Scanning through 1 files
touch loaded/wgEncodeGencodeTagV10.tab.loaded
touch check/wgEncodeGencodeGeneSourceV10.checked
touch check/wgEncodeGencodeTranscriptSourceV10.checked
mv -f tables/wgEncodeGencodeExonSupportV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeExonSupportV10.tab
mkdir -p loaded/
flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeExonSupportV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeExonSupport.sql tables/wgEncodeGencodeExonSupportV10.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeExonSupportV10.tab.loaded

real	46m55.757s
user	48m56.944s
sys	0m33.582s
------------------------------------------------------------------------------
