# for emacs: -*- mode: sh; -*-


# This file describes how we made the proteins database using the
# 6/29/03 release of the NR database files from SWISS-PROT

# First ftp over raw data files

	mkdir /cluster/store5/swissprot/072003
	cd /cluster/store5/swissprot/072003
        mkdir build

	ftp us.expasy.org

		cd databases
		cd sp_tr_nrdb

		binary
		prompt
		mget *

	gzip -d *.dat.gz


#  	After decompression, total of about 2.2 GB.
 
# create biosql072003 and proteins072003 mySQL databases at hgwdev
  from interactive mysql

	create database biosql072003;
	create database proteins072003;  

  cd /cluster/home/fanhsu/bioperl/bioperl-db/scripts

	createTables 072003
	
# use bioPerl to parse the data and load them into biosql
	
	loadAll3 072003 &

# loading will take about 1.5 days

	select count(*) from biosql072003.bioentry;
  
  to check progress.  Over 1 million rows will be generated.

o make a subdirectory of:

	/cluster/store5/proteins/072003

  and go there.

o dump table definitions from proteins070403

	mysqldump -d proteins070403 -u hgcat -pBigSecret > proteins.sql

o edit proteins.sql to comment out indexes i1, i2, i3, i4 of the spXref2 table

o create tables in proteins

	mysql -u hgcat -pBigSecret -A proteins072003 <proteins.sql

o run spXref2 to create spXref2.tab

      spXref2 072003 

  This will generate more than 6 million records and will take a while (~0.5 hr).

o load the table first and then create indexes.

	use proteins072003;

	LOAD DATA local INFILE 'spXref2.tab' into table proteins072003.spXref2;

	create index i1 on proteins072003.spXref2(accession);
	create index i2 on proteins072003.spXref2(displayID);
	create index i3 on proteins072003.spXref2(extAC);
	create index i4 on proteins072003.spXref2(bioentryID);

  Loading takes less than a minute and indexing takes a few minutes.
  Indexing takes about 12 minutes in total.

o Build the database table for HUGO

        o ftp over the HUGO data file:
	    cd /cluster/store5/protein/hugo
	    wget http://www.gene.ucl.ac.uk/public-files/nomen/nomeids.txt

        o remove the first line and save the file as hugo.tab

        o load the data, at the mysql prompt

	     LOAD DATA local INFILE 'hugo.tab' into table proteins072003.hugo;
  
o Build spXref3 table

     spXref3 072003 

  From mysql:

     LOAD DATA local INFILE 'spXref3.tab' into table proteins072003.spXref3;

o Build spOrganism table

	spOrganism /cluster/store5/swissprot/072003/sprot.dat      spOrganismSP.dat
	spOrganism /cluster/store5/swissprot/072003/trembl.dat     spOrganismTR.dat
	spOrganism /cluster/store5/swissprot/072003/trembl_new.dat spOrganismTRNEW.dat
	cat spOrganismSP.dat spOrganismTR.dat spOrganismTRNEW.dat |sort|uniq >spOrganism.tab
	rm spOrganismSP.dat spOrganismTR.dat spOrganismTRNEW.dat

  Load spOrganism.tab to mySQL

    LOAD DATA local INFILE 'spOrganism.tab' into table proteins072003.spOrganism;

o Build spSecondaryID table

  spSecondID is a program based on SEQIO 
  (under /projects/cc/hg/fanhsu/kent/src/hg/seqio/seqio-1.2.2)

	spSecondID /cluster/store5/swissprot/072003/sprot.dat
	cp all.lis all.tab
	cp 2nd.lis 2nd.tab

	spSecondID /cluster/store5/swissprot/072003/trembl.dat
	cat all.lis >>all.tab
	cat 2nd.lis >>2nd.tab

	spSecondID /cluster/store5/swissprot/072003/trembl_new.dat
	cat all.lis >>all.tab
	cat 2nd.lis >>2nd.tab

	rm all.lis
	rm 2nd.lis
  
  load into table:

    LOAD DATA local INFILE '2nd.tab' into table proteins072003.spSecondaryID;

 
o Build pepSequence table

	getallpep 072003

  From mysql:

     LOAD DATA local INFILE 'allPep.tab' into table proteins072003.pepSequence;

o Build pfamXref and pfamDesc tables

  Ftp over Pfam-A.full.gz from: 

	ftp.sanger.ac.uk/pub/databases/Pfam/Pfam-A.full.gz

  and save it at /cluster/store5/proteins/pfam/072803

  Uncompress it by

	gzip -d Pfam-A.full.gz

  Run pfamXref to generate pfamAXref.tab and pfamADesc.tab files

	pfamXref proteins072003 /cluster/store5/proteins/pfam/072803/Pfam-A.full pfamADesc.tab pfamAXref.tab

  Load them into mySQL

	load data local infile "pfamADesc.tab" into table proteins072003.pfamDesc;
	load data local infile "pfamAXref.tab" into table proteins072003.pfamXref;

o Build pdbSP table

	pdbSP proteins072003
	
  Load the output file, pdbSP.tab, into mySQL

	load data local infile "pdbSP.tab" into table proteins072003.pdbSP;

o Build spDisease table

  Create the spDisease table in proteins072003.

	CREATE TABLE spDisease (
  	accession varchar(40) NOT NULL default '',
  	displayID varchar(40) NOT NULL default '',
  	diseaseDesc text,
  	KEY accession (accession),
  	KEY displayID (displayID)
	) TYPE=MyISAM;
	
  Use the following SQL and hgsql to retrieve rows and create spDisease.tab.

select comment.acc, displayId.val, commentVal.val from sp092903.comment, sp092903.commentVal, sp092903.displayId where comment.commentType=19 and commentVal.id=comment.commentVal and displayId.acc=comment.acc;

  Load the table into proteins072003.

     load data local infile "spDisease.tab" into table proteins072003.spDisease;


