
TransMap maps transcripts from source assemblies to destination assemblies via chains.

Tracks are built across current vertebrate assemblies concurrently.  The
concurrent build is done as the fasta files and metadata tables for the source
transcripts are shared and stored in hgFixed/gbdb.  Modifications will be
made to allow incrementally adding destination genomes.

TransMap build is based on the pslMap and other tools in the kent/src tree.
The driver for generating the data is available in
 
   hg clone /cluster/home/markd/compbio/repos/hg/transMap
it also requires
   hg clone /cluster/home/markd/compbio/repos/hg/pycbio

The source is checked out and run in
    /hive/data/inside/transMap/build

General steps for setup, only done once:

    cd /hive/data/inside/transMap/build
    hg clone /cluster/home/markd/compbio/repos/hg/transMap
    mkdir transMap/extern
    cd  transMap/extern
    hg clone /cluster/home/markd/compbio/repos/hg/pycbio
    (cd pycbio && make)

##############################################################################################
# TransMap V3 tracks (2014-12-18 markd)
##
   # get data and setup batch
   make getData
   make batchSetup

   # build mappings
   ssh kk
   para create -batch=para1/aln para1/aln/jobs.para
   para try -batch=para1/aln
   para push -batch=para1/aln -maxPush=1000000
   para create -batch=para1/filt para1/filt/jobs.para
   para try -batch=para1/filt
   para push -batch=para1/filt -maxPush=1000000

   # load and verify data
   ssh hgwdev
   make -j 8 loadDb
   make runJoinerCheck

   # create push.list which has all tables for databases active on RR or hgwbeta
   make pushList

   # create template for hg/makeDb/doc that points to this file and append it to all
   # databases in this build:
   destDbs=$(make printDestDbs)
   cd ~kent/src/hg/makeDb/doc
   for db in $destDbs ; do cat /hive/data/inside/transMap/build/transMap/builds/vertebrate.2014-12-11.V3/makedb.doc.template >> $db.txt; done


   # request push of tracks
   # note: was never made public
##############################################################################################
# TransMap V4 tracks (2017-02-10 markd)
##
  - This is the build using bigTransMap format (derived from bigPsl), not
    mysql tracks.
    Build code is here:
       /cluster/home/markd/compbio/repos/git/transMap.git
    which also requires
      /cluster/home/markd/compbio/repos/git/pycbio.git
      git@github.com:diekhans/pipettor.git

  - Build process:
    tmdir=~/compbio/projs/transMap/transMap-dev
    mkdir -p  /hive/data/outside/transMap/V4/logs
    pushd /hive/data/outside/transMap/V4
    (time nice ${tmdir}/bin/luigiTransMap --workers=32 --logConfFile=${tmdir}/etc/logging.conf ${tmdir}/etc/transMapBuildConf.py) >&logs/2017-01-12.log &
    # takes maybe 10-14 hours.  At the end, check for parasol jobs
    # that are lost (usually one or two stragglers for a few batches), then do
    para stop -batch=/the/batch/name
    para push -batch=/the/batch/name
    # now wait for luigiTransMap to complete.  Must manually check log, as
    # current luigi doesn't exit non-zero if there were errors

  # create symlinks to /gbdb for bigPsl files
    ${tmdir}/bin/linkToGbdb  ${tmdir}/etc/transMapBuildConf.py

  # add trackDb entries at top level by editing:
    src/hg/makeDb/trackDb/trackDb.transMap.ra 
  # create push lists for databases and gbdb files for genomes on hgwbeta:
    ${tmdir}/bin/mkPushList ${tmdir}/etc/transMapBuildConf.py db.push.lst gbdb.push.lst

  # and creating the following html files:
    transMapV4.html
    transMapEnsemblV4.html
    transMapEstV4.html
    transMapRefseqV4.html
    transMapRnaV4.html


##############################################################################################
# TransMap V5 tracks (2019-06-10 markd)
##
  - Build code is here:
       /cluster/home/markd/compbio/repos/git/transMap.git
    which also requires
      /cluster/home/markd/compbio/repos/git/pycbio.git
      git@github.com:diekhans/pipettor.git

  - Build process:
    # IMPORTANT: SET VERSION IN transMap-dev/etc/transMapBuildConf.py
    tmdir=~/compbio/projs/transMap/transMap-dev
    mkdir -p  /hive/data/inside/transMap/V5/logs
    pushd /hive/data/inside/transMap/V5
    (time nice ${tmdir}/bin/luigiTransMap --workers=64 --logConfFile=${tmdir}/etc/logging.conf ${tmdir}/etc/transMapBuildConf.py) >&logs/2019-06-24.log &
    # takes maybe a day or more.  At the end, check for parasol jobs
    # that are lost (usually one or two stragglers for a few batches), then do
    para stop -batch=/the/batch/name
    para push -batch=/the/batch/name
    # now wait for luigiTransMap to complete.  Must manually check log, as
    # current luigi doesn't exit non-zero if there were errors

  # create symlinks to /gbdb for bigPsl files
    ${tmdir}/bin/linkToGbdb  ${tmdir}/etc/transMapBuildConf.py

  # add trackDb entries at top level by editing:
    src/hg/makeDb/trackDb/trackDb.transMap.ra 
  # create push lists for databases and gbdb files for genomes on hgwbeta:
    ${tmdir}/bin/mkPushList ${tmdir}/etc/transMapBuildConf.py db.push.lst gbdb.push.lst


##############################################################################################
# TransMap V6 tracks (2021-04-26 markd)
##
  - Build code is here:
       /cluster/home/markd/compbio/repos/git/transMap.git
    which also requires
      /cluster/home/markd/compbio/repos/git/pycbio.git
      git@github.com:diekhans/pipettor.git

  - Build process:
    # IMPORTANT: SET VERSION IN transMap-dev/etc/transMapBuildConf.py
    tmdir=~markd/compbio/projs/transMap/transMap
    mkdir -p  /hive/data/inside/transMap/V6/logs
    pushd /hive/data/inside/transMap/V6
    # must start in foreground then background, as $(idate) is an alias that bash doesn't find
    (time nice ${tmdir}/bin/luigiTransMap --workers=64 --logConfFile=${tmdir}/etc/logging.conf ${tmdir}/etc/transMapBuildConf.py) >&logs/$(idate).log
    ^Z bg

    # takes maybe a day or more.  At the end, check for parasol jobs
    # that are lost (usually one or two stragglers for a few batches), then do
    para stop -batch=/the/batch/name
    para push -batch=/the/batch/name
    # now wait for luigiTransMap to complete.  Review and check the log.

  # compare to stats/detailed of previous to see if all make sense
  (cd ../V5/stats/detailed && find -type f)  | sort >V5.stats.lst
  (cd ../V6/stats/detailed && find -type f)  | sort >V6.stats.lst
  diff V5.stats.lst V6.stats.lst >&stats.lst.diff

  # all removed genomes have been deactivated as they have been moved to
  # assembly hubs.  these are not public hubs.

  # create symlinks to /gbdb for bigPsl files
    ${tmdir}/bin/linkToGbdb  ${tmdir}/etc/transMapBuildConf.py

  # add trackDb entries at top level by editing:
    src/hg/makeDb/trackDb/trackDb.transMap.ra 
  # create push lists for databases and gbdb files for genomes on hgwbeta:
    ${tmdir}/bin/mkPushList ${tmdir}/etc/transMapBuildConf.py db.push.lst gbdb.push.lst

   # collect QC data and make plots
   ${tmdir}/bin/transMapStatsQCPlot --countChangeTsv=transMap-V5-V6.stats.tsv ../V5/stats/overall.stats.tsv ../V5/db.push.lst stats/overall.stats.tsv db.push.lst transMap-V5-V6.stats.pdf
   ${tmdir}/bin/transMapDetailsCmp ../V5/stats/detailed stats/detailed transMap-V5-V6.detailed.tsv

  # update tracks
  # add  transMapV6.html
  # update  trackDb.transMap.ra

  cd src/hg/makeDb/trackDb
  make -j 64



##############################################################################################
