# This file describes how to build the knownToMupit and mupitRanges tables.
# knownToMupit links knownGene IDs to the PDB IDs supported by MuPIT; hgGene
#   uses it for links to 3D protein structures.
# mupitRanges is used on hgc pages for dbSnp to decide whether a SNP position
#   is supported by MuPIT.
# Create the MuPIT working directory and move into it.
# The absolute path keeps mkdir from creating "mupit" in whatever directory
# the shell happens to be in, and -p makes the step safe to re-run once the
# directory exists. cd only runs when mkdir succeeded.
mkdir -p /hive/data/outside/mupit && cd /hive/data/outside/mupit

# mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com)

# wc -l mupit-pdbids.txt
# Build and load knownToMupit for hg38, hg19 and hg18.
# Each step is checked: when the query fails or the filter yields no rows,
# the assembly is skipped so hgLoadSqlTab is never handed an empty or
# partial file.
for db in hg38 hg19 hg18; do
    # get knownGene IDs and associated PDB IDs
    # the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint()
    if ! hgsql -Ne "select kgID, extAcc1 from $db.kgXref x \
        inner join sp180404.extDbRef sp on x.spID = sp.acc \
        inner join sp180404.extDb e on sp.extDb=e.id \
        where x.spID != '' and e.val='PDB' order by kgID" \
        > "$db.knownToPdb.txt"; then
        echo "$db: PDB cross-reference query failed; knownToMupit not loaded" >&2
        continue
    fi
    # filter out pdbIds not found in mupit: the MuPIT list is upper-cased and
    # used as fixed-string, whole-word patterns against the query output.
    # grep exits non-zero when nothing matched (or the list could not be read).
    if ! tr 'a-z' 'A-Z' < mupit-pdbids.txt \
        | grep -Fwf - "$db.knownToPdb.txt" > "$db.knownToMupit.txt"; then
        echo "$db: filter produced no rows; knownToMupit not loaded" >&2
        continue
    fi
    # check that it filtered correctly:
    # cut -f2 $db.knownToMupit.txt | sort -u | wc -l;
    # load new table for hgGene/hgc
    hgLoadSqlTab "$db" knownToMupit ~/kent/src/hg/lib/knownTo.sql "$db.knownToMupit.txt"
done

# mupit.sqlite came from Rick Kim: rkim@insilico.us.com
# Get all the entries from the database:
sqlite3 mupit.sqlite ".databases"
# seq  name             file
# ---  ---------------  ----------------------------------------------------------
# 0    main             /hive/data/outside/mupit/mupit.sqlite
sqlite3 mupit.sqlite ".tables"
# mupit
# Dump the whole mupit table with a tab as the column separator. $'\t' is
# bash quoting for a tab character; it replaces a literal tab typed with
# "ctrl-v tab", which editors and copy/paste can silently turn into spaces.
# tawk then prints column 1, column 2 minus one, and column 3, and the rows
# are sorted by column 1 and numerically by column 2.
# NOTE: nothing here drops rows whose start exceeds their end; hgLoadBed
# rejects those further down, and the "Redo hg38" step rebuilds this file.
sqlite3 -separator $'\t' mupit.sqlite "select * from mupit;" \
    | tawk '{print $1,$2-1,$3}' \
    | sort -k1,1 -k2,2n > hg38.mupitPositions.bed

# mupit_hg19_chrompos.txt was also supplied by Rick Kim.
# It has only two columns, so turn it into a three-column range file:
# column 1 as is, column 2 minus one, then column 2 itself.
tawk '{print $1,$2-1,$2}' mupit_hg19_chrompos.txt \
    | sort -k1,1 -k2,2n \
    | uniq \
    > hg19.mupitPositionsZeroBased.txt
# Collapse the sorted, de-duplicated rows with bedtools merge.
bedtools merge -i hg19.mupitPositionsZeroBased.txt > hg19.mupitPositions.bed

# Load each assembly's .mupitPositions.bed file into its mupitRanges table.
for db in hg19 hg38
do
    hgLoadBed -tab "$db" mupitRanges "$db.mupitPositions.bed"
done
# Reading hg19.mupitPositions.bed
# Read 171024 elements of size 3 from hg19.mupitPositions.bed
# Sorted
# Creating table definition for mupitRanges, bedSize: 3
# Saving bed.tab
# Loading hg19
# Reading hg38.mupitPositions.bed
# ERROR: line 10547:'chr1 248858130       248857859'
# chromStart after chromEnd (248858130 > 248857859)

# Redo hg38 and throw out positions where chromStart > chromEnd.
# A row is kept only when column 2 <= column 3, tested before 1 is
# subtracted from column 2. $'\t' is bash quoting for a tab, used as the
# sqlite3 column separator instead of a fragile literal tab character.
sqlite3 -separator $'\t' mupit.sqlite "select * from mupit;" \
    | tawk '$2 <= $3 {print $1, $2-1, $3}' \
    | sort -k1,1 -k2,2n > hg38.mupitPositions.bed
# Reload mupitRanges for both assemblies; this time the load is clean:
for db in hg19 hg38
do
    hgLoadBed -tab "$db" mupitRanges "$db.mupitPositions.bed"
done
# Reading hg19.mupitPositions.bed
# Read 171024 elements of size 3 from hg19.mupitPositions.bed
# Sorted
# Creating table definition for mupitRanges, bedSize: 3
# Saving bed.tab
# Loading hg19
# Reading hg38.mupitPositions.bed
# Read 116709 elements of size 3 from hg38.mupitPositions.bed
# Sorted
# Creating table definition for mupitRanges, bedSize: 3
# Saving bed.tab
# Loading hg38

