#!/bin/tcsh -efx
# Script to create a relational version of UniProt database.  Should be run on
# hgwdev.

# NOTE: for the next build, just update DBDATE below to the new release date.
# Everything else is derived from $DBDATE (the database name is sp$DBDATE, also
# available as $DB), so do not hand-substitute the date elsewhere in the script.

# DBDATE is the date stamp of the UniProt snapshot being built (it looks like
# yymmdd -- confirm); DB is the name of the database created from it.
set DBDATE = 140122
set DB = sp${DBDATE}

# Set up working directory
set buildDir = /hive/data/outside/uniProt/$DBDATE/build
mkdir -p $buildDir

# Download uniProt.  Timings recorded on the previous run (roughly 17 hours in
# total, almost all of it for TrEMBL):
#   uniprot_sprot.dat.gz                486,223,605  262K/s   in 28m 26s
#   uniprot_trembl.dat.gz            22,811,969,157 6.58M/s   in 16h 31m
#   uniprot_sprot_varsplic.fasta.gz       7,393,531 1.66M/s   in 8.7s
#
# --continue makes a re-run resume a partially downloaded file.  Without it,
# wget saves the new copy under a ".1" suffix and the truncated original is
# what the *.dat.gz glob in the spToDb step below would pick up.
cd $buildDir
set uniProtUrl = ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete
foreach f (uniprot_sprot.dat.gz uniprot_trembl.dat.gz uniprot_sprot_varsplic.fasta.gz)
  wget --continue $uniProtUrl/$f
end

# Turn flat file into relational tab-separated files.
# Every *.dat.gz in the build directory (the Swiss-Prot and TrEMBL flat files
# downloaded above) is decompressed into a single stream that spToDb reads
# from stdin; ../tabFiles is passed as its output location and is the
# directory the load step below reads *.txt files from.
time zcat *.dat.gz | spToDb stdin ../tabFiles
# Recorded on the previous run:
# real    32m21.930s

# Create the database.  hgsql is pointed at mm10 only to issue the statement;
# presumably any existing database would do -- confirm.
echo "create database $DB;" | hgsql mm10

# Populate the new database with the table definitions kept in the source tree.
hgsql $DB < ~/kent/src/hg/protein/spToDb/spDb.sql

# Load up the data from tab files, one hgsqlimport call per *.txt file.
# The previous run took about 3.4 hours (12170 seconds, recorded below).
set loadStart = `date +%s`
cd /hive/data/outside/uniProt/$DBDATE/tabFiles
foreach tabFile (*.txt)
  hgsqlimport --local $DB $tabFile
end
set loadEnd = `date +%s`
# Print the elapsed seconds.  The shell's own arithmetic is used instead of
# expr(1) because expr exits with status 1 when the result is 0, which would
# abort this script under the -e flag.
@ loadSecs = $loadEnd - $loadStart
echo $loadSecs
# 12170

# sp140122.accToKeyword: Records: 107641689  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.accToTaxon: Records: 52159208  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.author: Records: 494838  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.citation: Records: 61734988  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.citationRc: Records: 55008839  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.citationRp: Records: 172575  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.comment: Records: 80808264  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.commentType: Records: 29  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.commentVal: Records: 328352  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.commonName: Records: 118788  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.description: Records: 52159208  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.displayId: Records: 52159208  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.extDb: Records: 127  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.extDbRef: Records: 586498660  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.feature: Records: 35270331  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.featureClass: Records: 39  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.featureId: Records: 1576146  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.featureType: Records: 418114  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.gene: Records: 61530238  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.geneLogic: Records: 50178390  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.info: Records: 52159208  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.keyword: Records: 1127  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.organelle: Records: 10664  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.otherAcc: Records: 333988  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.pathogenHost: Records: 6216  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.protein: Records: 52159208  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.proteinEvidence: Records: 52159208  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.proteinEvidenceType: Records: 5  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.rcType: Records: 4  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.rcVal: Records: 1260506  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.reference: Records: 337684  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.referenceAuthors: Records: 1907542  Deleted: 0  Skipped: 0  Warnings: 0
# sp140122.taxon: Records: 463860  Deleted: 0  Skipped: 0  Warnings: 0


# Add varsplice info.
# spDbAddVarSplice is given the database name, the splice-variant FASTA on
# stdin, and "." (the current tabFiles directory); presumably that is where it
# writes the var*.txt files loaded below -- confirm.
zcat ../build/uniprot_sprot_varsplic.fasta.gz | spDbAddVarSplice $DB stdin .

# Append each generated file to its table.  /dev/null is passed in the
# position between the table name and the tab file on every call.
# NOTE(review): varProtein.txt is appended to both varProtein and protein;
# this looks deliberate -- confirm.
set loadOpts = (-notOnServer -append)
hgLoadSqlTab $DB $loadOpts varProtein  /dev/null varProtein.txt
hgLoadSqlTab $DB $loadOpts protein     /dev/null varProtein.txt
hgLoadSqlTab $DB $loadOpts varAcc      /dev/null varAcc.txt
hgLoadSqlTab $DB $loadOpts displayId   /dev/null varDisplayId.txt
hgLoadSqlTab $DB $loadOpts accToTaxon  /dev/null varAccToTaxon.txt
hgLoadSqlTab $DB $loadOpts geneLogic   /dev/null varGeneLogic.txt
hgLoadSqlTab $DB $loadOpts gene        /dev/null varGene.txt
hgLoadSqlTab $DB $loadOpts description /dev/null varDescription.txt

# Add table descriptions, taken from the .as file kept alongside the schema.
makeTableDescriptions $DB ~/kent/src/hg/protein/spToDb/spDbTables.as

# Zip up tab files for people who prefer them to database.  This compresses
# every *.txt in the current (tabFiles) directory, including the var*.txt
# files used by the varsplice step.
gzip *.txt


