#!/bin/bash -e
# The -e on the shebang line makes bash exit as soon as a command fails; it
# only takes effect when this file is executed directly, not when it is run
# as an argument to bash.
#
# Help text for the script.  It is written to stderr when the number of
# positional arguments left after option parsing is not exactly three.
# NOTE(review): the example inside the text passes only two positional
# arguments (homo_sapiens core_43_36e) while the script requires three
# (organism version db) -- confirm what the example should look like.
usage="exportEnsembl - export genePred files and tab files from ENSEMBL or VEGA native database.

usage: exportEnsembl [options] organism version db
      -l liftfile - lift genePreds using this file (usually for contigs to random chroms)
      -f filterFile - drop ENSEMBL transcript ids that are in this file.  Empty lines and
         whose starting with # are ignored.
      -p load pseudogene track
   for example:
       exportEnsembl -l lift homo_sapiens core_43_36e

   creates approriate load files in current directory
"
# Abort on the first failing command.  The shebang already asks for -e, but
# that is not applied when the script is started as `bash exportEnsembl ...`.
set -e

# Option defaults.  The literal "NO" is the sentinel for "no file given".
liftfile="NO"
filterFile="NO"
pseudo=0
# Extra arguments for ensGeneToGenePred; expanded unquoted later so that an
# empty value contributes no argument at all.
filterOpts=""

# The leading ':' in the option string selects getopts' silent mode: the
# script prints its own diagnostics, and the offending option letter is
# delivered in OPTARG (theOption is then '?' or ':').
while getopts ":pl:f:" theOption; do
    case $theOption in
        l ) liftfile=$OPTARG;;
        f ) filterFile=$OPTARG
            filterOpts="-ignore $filterFile";;
        p ) pseudo=1;;
        : ) echo "Option -$OPTARG requires an argument" >&2
            exit 1;;
        \? ) echo "Invalid option: -$OPTARG" >&2
            exit 1;;
        * ) echo "BUG: option $theOption not handled" >&2
            exit 1;;
    esac
done
shift $((OPTIND - 1))

# Exactly three positional arguments are required: organism version db.
if [[ $# -ne 3 ]]; then
    echo "Error: wrong number of args:
$usage" >&2
    exit 1
fi
organism=$1
version=$2
db=$3
# Name of the source database, built from the organism and version arguments.
ensDb="ensembl_${organism}_${version}"

# Output file prefixes depend on whether the version names a VEGA release
# (version starts with "vega_") or a regular ENSEMBL one.
case "${version}" in
    vega_* )
        genePre=vegaGene
        pseudoPre=vegaPseudo
        infoPre=vegaInfo
        xrefPre=vegaGeneXref
        gtpPre=vegaGtp
        # vega has tons of frame problems, so don't use it.
        frameOpt=""
        ;;
    * )
        genePre=ensGene
        pseudoPre=ensPseudo
        infoPre=ensInfo
        xrefPre=ensGeneXref
        gtpPre=ensGtp
        frameOpt="-genePredExt"
        ;;
esac

# When a lift file was supplied, the genePreds taken from the database are
# written under an ".unlifted" name, leaving the plain name for the lifted
# result.  The comparison is quoted so that a lift file path containing
# whitespace does not break the test; an empty or unset value is treated
# like the "NO" sentinel.
if [[ "${liftfile:-NO}" != "NO" ]]; then
    geneUnliftedPre="${genePre}.unlifted"
    pseudoUnliftedPre="${pseudoPre}.unlifted"
else
    geneUnliftedPre="${genePre}"
    pseudoUnliftedPre="${pseudoPre}"
fi

# extract genes
# Two queries fill one .exons file, one row per exon, ordered by transcript
# stable id and exon start:
#   1. transcripts whose display_xref_id matches an xref row (display_label
#      is taken from xref);
#   2. transcripts whose display_xref_id is null (the literal "noXref" is
#      used in place of the label), appended to the same file.
# Both queries skip genes whose biotype matches "%pseudo%".
echo "Extract genes to ${geneUnliftedPre}.exons"
# Trace every command from here on.
set -x
hgsql "$ensDb" -N -B  -e 'select sr.name as chrom, g.biotype, if(e.seq_region_strand > 0, fe.seq_region_start+seq_start-2, le.seq_region_end-seq_end) as cdsStart, if(e.seq_region_strand > 0, le.seq_region_start+seq_end-1, fe.seq_region_end-seq_start+1) as cdsEnd, e.seq_region_start-1 as exonStart, e.seq_region_end as exonEnd, ".",if(e.seq_region_strand < 0,"-","+") as strand,  if(e.exon_id = start_exon_id ,0, e.phase ) as phase, display_label, tsi.stable_id, gsi.stable_id, rank,g.description , g.gene_id, t.transcript_id from gene g, gene_stable_id gsi, seq_region sr , exon_stable_id es, exon e, transcript t left join translation r on r.transcript_id = t.transcript_id, exon_transcript et , transcript_stable_id tsi, xref x, exon fe , exon le where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and start_exon_id = fe.exon_id and end_exon_id = le.exon_id and tsi.transcript_id = t.transcript_id and t.display_xref_id = x.xref_id and g.biotype not like "%pseudo%" order by tsi.stable_id, e.seq_region_start;' > "${geneUnliftedPre}.exons"
hgsql "$ensDb" -N -B  -e 'select sr.name as chrom, g.biotype, if(e.seq_region_strand > 0, fe.seq_region_start+seq_start-2, le.seq_region_end-seq_end) as cdsStart, if(e.seq_region_strand > 0, le.seq_region_start+seq_end-1, fe.seq_region_end-seq_start+1) as cdsEnd, e.seq_region_start-1 as exonStart, e.seq_region_end as exonEnd, ".",if(e.seq_region_strand < 0,"-","+") as strand,  if(e.exon_id = start_exon_id ,0, e.phase ) as phase, "noXref", tsi.stable_id, gsi.stable_id, rank,g.description , g.gene_id, t.transcript_id from gene g, gene_stable_id gsi, seq_region sr , exon_stable_id es, exon e, transcript t left join translation r on r.transcript_id = t.transcript_id, exon_transcript et , transcript_stable_id tsi, exon fe , exon le where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and start_exon_id = fe.exon_id and end_exon_id = le.exon_id and tsi.transcript_id = t.transcript_id and t.display_xref_id is null and g.biotype not like "%pseudo%" order by tsi.stable_id, e.seq_region_start;' >> "${geneUnliftedPre}.exons"
# frameOpt and filterOpts are deliberately left unquoted: an empty value must
# vanish from the command line and "-ignore FILE" must split into two words.
ensGeneToGenePred ${frameOpt} ${filterOpts} "${geneUnliftedPre}.exons" > "${geneUnliftedPre}.gp"
wc -l "${geneUnliftedPre}.gp"
# Lift only when a lift file was given ("NO" is the no-file sentinel; an
# empty value is treated the same way).
if [[ "${liftfile:-NO}" != "NO" ]]; then
    # NOTE(review): frameOpt is "-genePredExt" for non-VEGA input; confirm that
    # liftUp accepts this spelling of the extended-genePred flag.
    liftUp ${frameOpt} "${genePre}.gp" "$liftfile" carry "${geneUnliftedPre}.gp"
fi
genePredCheck -db="$db" "${genePre}.gp"

# extract pseudogenes; don't create frames, too many with bogus annotations.
# Runs only when -p was given.  The query keeps genes whose biotype matches
# "%pseudo%", reports the gene's own start-1/end in the cdsStart/cdsEnd
# columns, and emits g.biotype in the column where the gene query has the
# display label.
if [[ "$pseudo" == 1 ]]; then
    echo "Extract pseudogenes to ${pseudoUnliftedPre}.exons"
    hgsql "$ensDb" -N -B -e 'select sr.name as chrom, g.biotype, g.seq_region_start-1 as cdsStart, g.seq_region_end as cdsEnd,e.seq_region_start-1 as exonStart, e.seq_region_end as exonEnd, ".",if(e.seq_region_strand < 0,"-","+") as strand, e.phase, g.biotype, tsi.stable_id, gsi.stable_id, rank,g.gene_id, t.transcript_id from gene_stable_id gsi, seq_region sr , exon_stable_id es, exon e, transcript t, exon_transcript et , transcript_stable_id tsi,  gene g where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and tsi.transcript_id = t.transcript_id and g.biotype like "%pseudo%" order by tsi.stable_id, e.seq_region_start;' > "${pseudoUnliftedPre}.exons"
    # filterOpts is deliberately unquoted: empty means no argument, otherwise
    # it must split into "-ignore" and the file name.
    ensGeneToGenePred ${filterOpts} "${pseudoUnliftedPre}.exons" > "${pseudoUnliftedPre}.gp"
    wc -l "${pseudoUnliftedPre}.gp"
    # Lift only when a lift file was given ("NO" is the no-file sentinel; an
    # empty value is treated the same way).
    if [[ "${liftfile:-NO}" != "NO" ]]; then
        liftUp "${pseudoPre}.gp" "$liftfile" carry "${pseudoUnliftedPre}.gp"
    fi
    genePredCheck -db="$db" "${pseudoPre}.gp"
fi

# extract ensInfo and ${gtpPre}
# This first query (re)creates ${infoPre}.tab with one distinct row per
# transcript whose display_xref_id matches an xref row: transcript stable id,
# display label, gene stable id, biotype, description and status.
echo "extract info details table to ${infoPre}.tab"
hgsql "$ensDb" -N -B -e 'select distinct tsi.stable_id, display_label, gsi.stable_id, g.biotype, g.description, g.status from gene g, gene_stable_id gsi, seq_region sr , exon_stable_id es, exon e, transcript t left join translation r on r.transcript_id = t.transcript_id, exon_transcript et , transcript_stable_id tsi, xref x, exon fe , exon le where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and start_exon_id = fe.exon_id and end_exon_id = le.exon_id and tsi.transcript_id = t.transcript_id and t.display_xref_id = x.xref_id order by tsi.stable_id' > "${infoPre}.tab"
#hgsql $ensDb -N -B -e 'select distinct tsi.stable_id, display_label, gsi.stable_id, g.biotype, g.description, g.status from gene g, gene_stable_id gsi, seq_region sr , exon_stable_id es, object_xref o, exon e, transcript t left join translation r on r.transcript_id = t.transcript_id, exon_transcript et , transcript_stable_id tsi, xref x, exon fe , exon le where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and start_exon_id = fe.exon_id and end_exon_id = le.exon_id and tsi.transcript_id = t.transcript_id and o.xref_id = x.xref_id and ensembl_id = t.transcript_id order by tsi.stable_id' > ${infoPre}.tab
#hgsql $ensDb -N -B -e 'select distinct tsi.stable_id, display_label, gsi.stable_id, g.biotype, g.description, g.status from gene g, gene_stable_id gsi, seq_region sr , exon_stable_id es, object_xref o, exon e, transcript t,  translation r,  exon_transcript et , transcript_stable_id tsi, xref x, exon fe , exon le where r.transcript_id = t.transcript_id and gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and start_exon_id = fe.exon_id and end_exon_id = le.exon_id and tsi.transcript_id = t.transcript_id and o.xref_id = x.xref_id and ensembl_id = t.transcript_id order by tsi.stable_id' > ${infoPre}.tab
# Append the transcripts that have no display xref (display_xref_id is null)
# to ${infoPre}.tab, with the literal "noXref" in the label column.
hgsql "$ensDb" -N -B -e 'select distinct tsi.stable_id, "noXref", gsi.stable_id, g.biotype, g.description, g.status from gene_stable_id gsi, seq_region sr , exon_stable_id es, exon e, transcript t , exon_transcript et , transcript_stable_id tsi, gene g where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and t.transcript_id = et.transcript_id and et.exon_id = e.exon_id and e.exon_id = es.exon_id and tsi.transcript_id = t.transcript_id and t.display_xref_id is null' >> "${infoPre}.tab"
# Write the gene / transcript / translation stable-id triples to ${gtpPre}.tab.
hgsql "$ensDb" -N -B -e 'select distinct gsi.stable_id, tsi.stable_id, rsi.stable_id from gene g, translation_stable_id rsi, gene_stable_id gsi, seq_region sr , transcript t left join translation r on r.transcript_id = t.transcript_id, transcript_stable_id tsi where gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and rsi.translation_id = r.translation_id and tsi.transcript_id = t.transcript_id order by tsi.stable_id' > "${gtpPre}.tab"
#extract cross reference table 
#hgsql $ensDb -N -B -e "select * from external_db" > externalDb.tab

#hgsql $ensDb -N -B -e 'select distinct "'$version'", logic_name, t.biotype, g.gene_id,  gsi.stable_id, gsi.version, t.transcript_id, tsi.stable_id, tsi.version,rsi.stable_id, r.translation_id, rsi.version, db_name, dbprimary_acc, t.status from gene g, translation_stable_id rsi, gene_stable_id gsi, seq_region sr , analysis a, transcript t left outer join xref x on t.display_xref_id = x.xref_id , translation r, transcript_stable_id tsi where r.transcript_id = t.transcript_id and gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and rsi.translation_id = r.translation_id and tsi.transcript_id = t.transcript_id and a.analysis_id = t.analysis_id' > ${xrefPre}.tab
# Write the cross-reference table to ${xrefPre}.tab.  The version argument is
# embedded as a string constant in the first output column; it is spliced in
# inside double quotes so the shell never word-splits or globs it.  Rows whose
# db_name starts with "AFFY" are left out.
hgsql "$ensDb" -N -B -e 'select distinct "'"$version"'", logic_name, t.biotype, g.gene_id,  gsi.stable_id, gsi.version, t.transcript_id, tsi.stable_id, tsi.version,rsi.stable_id, r.translation_id, rsi.version, db_name, dbprimary_acc, t.status from object_xref o, gene g, translation_stable_id rsi, gene_stable_id gsi, seq_region sr , analysis a, transcript t, xref x, translation r, transcript_stable_id tsi where o.xref_id = x.xref_id and ensembl_id = t.transcript_id and r.transcript_id = t.transcript_id and gsi.gene_id=g.gene_id and g.seq_region_id = sr.seq_region_id and g.gene_id = t.gene_id and rsi.translation_id = r.translation_id and tsi.transcript_id = t.transcript_id and a.analysis_id = t.analysis_id and db_name not like "AFFY%"' > "${xrefPre}.tab"

echo "Finished Export"
