#!/bin/tcsh -efx
# Script to create a relational version of UniProt database.  Should be run on
# hgwdev.

# Release stamp for this build.  To build a newer release, change DB here
# (and only here): the working directory and the MySQL database name below
# are both derived from it.
# (120323 is presumably a YYMMDD date stamp -- confirm before reusing.)
set DB = 120323

# MySQL database that will hold the relational version of UniProt.
set spDb = sp${DB}

# Root of the working area for this release; holds build/ and tabFiles/.
set workDir = /hive/data/outside/swissprot/${DB}

# Directory on the UniProt FTP site that the three input files come from.
# NOTE: this is "current_release", so what gets downloaded is whatever
# UniProt is serving on the day the script runs, regardless of DB.
set uniprotUrl = ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete

# Set up working directory
mkdir -p ${workDir}/build

# Download swissprot. This will take about 12 hours.
# (The sizes/timings recorded below are from the 120323 build.)
cd ${workDir}/build
wget ${uniprotUrl}/uniprot_sprot.dat.gz
#  460,569,170  292K/s   in 24m 27s
wget ${uniprotUrl}/uniprot_trembl.dat.gz
# 8,377,086,168  172K/s   in 11h 2m
wget ${uniprotUrl}/uniprot_sprot_varsplic.fasta.gz
#  6,672,072    173K/s   in 32s

# Turn flat file into relational tab-separated files.
# RECORD THE TIME IT TAKES TO RUN THIS!  The tcsh "time" builtin cannot time
# a pipeline, so print wall-clock timestamps before and after instead.
# (../tabFiles is not created by this script; presumably spToDb creates it.)
date
zcat *.dat.gz | spToDb stdin ../tabFiles
date

# Create the database.
# The here-document delimiter is unquoted, so ${spDb} is expanded by the shell.
hgsql mysql <<end
create database ${spDb};
end

# Load it up with table definitions from source directory
hgsql ${spDb} < ~/kent/src/hg/protein/spToDb/spDb.sql

# Load up the data from tab files.  This takes about an hour.
cd ${workDir}/tabFiles
foreach i (*.txt)
    hgsqlimport --local ${spDb} "$i"
end

# Add varsplice info.  spDbAddVarSplice is given "." as its last argument;
# the var*.txt files loaded below are read from the current directory.
zcat ../build/uniprot_sprot_varsplic.fasta.gz | spDbAddVarSplice ${spDb} stdin .
# NOTE(review): varProtein.txt is appended to BOTH the varProtein and the
# protein tables -- presumably intentional; confirm before changing.
hgLoadSqlTab ${spDb} -append varProtein /dev/null varProtein.txt
hgLoadSqlTab ${spDb} -append protein /dev/null varProtein.txt
hgLoadSqlTab ${spDb} -append varAcc /dev/null varAcc.txt
hgLoadSqlTab ${spDb} -append displayId /dev/null varDisplayId.txt
hgLoadSqlTab ${spDb} -append accToTaxon /dev/null varAccToTaxon.txt
hgLoadSqlTab ${spDb} -append geneLogic /dev/null varGeneLogic.txt
hgLoadSqlTab ${spDb} -append gene /dev/null varGene.txt
hgLoadSqlTab ${spDb} -append description /dev/null varDescription.txt

# Add table descriptions
makeTableDescriptions ${spDb} ~/kent/src/hg/protein/spToDb/spDbTables.as

# Zip up tab files for people who prefer them to database.
# (This also compresses the var*.txt files written above.)
gzip *.txt


