# for emacs: -*- mode: sh; -*-

############################################################################
# STS MARKERS (DONE - 2015-06-22 - Hiram)
    # do NOT need to start over with downloads from NCBI, the data files
    # remain the same from hg19, simply start using the files prepared during
    # the hg19 build to run the blat alignments

    mkdir /hive/data/genomes/hg38/bed/stsMap
    cd /hive/data/genomes/hg38/bed/stsMap


    # Create sts sequence alignments
    mkdir /hive/data/genomes/hg38/bed/stsMap/split
    cd /hive/data/genomes/hg38/bed/stsMap/split
    ln -s /hive/data/genomes/hg19/bed/sts/split/*.fa .

    ssh ku
    mkdir /hive/data/genomes/hg38/bed/stsMap/run
    cd /hive/data/genomes/hg38/bed/stsMap/run

    #	going to run separate runs for the golden path sequence vs. the
    #	randoms, haplotypes, chrUn and chrM
    #	40,000,000 chunck sizes, 20,000 overlap
    partitionSequence.pl 40000000 20000 /hive/data/genomes/hg38/hg38.2bit \
	/hive/data/genomes/hg38/chrom.sizes 100 -lstDir tParts \
	| egrep -v "tParts|random|_alt|chrUn" \
	| sed -e "s/.*2bit://;" > hg38.list
    # verify only chroms are left:
    sed -e 's/:.*//;' hg38.list | sort -u | xargs echo | fold -s
# chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20
# chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY

    ls -1S ../split > sts.list

    cat > template << '_EOF_'
#LOOP
runOne $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat > runOne << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/dev/shm/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /hive/data/genomes/hg38/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \
    /hive/data/genomes/hg38/hg38.2bit:$partSpec \
	../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
    # << happy emacs
    chmod +x runOne

    gensub2 hg38.list sts.list template jobList
    #	these jobs run quickly, allow only 100 at a time
    para -maxJob=400 create jobList
# 8460 jobs in batch
    para try ... check ... push ... etc
# Completed: 8460 of 8460 jobs
# CPU time in finished jobs:      63833s    1063.88m    17.73h    0.74d  0.002 y
# IO & Wait Time:                 21847s     364.12m     6.07h    0.25d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest finished job:              80s       1.33m     0.02h    0.00d
# Submission to last job:           207s       3.45m     0.06h    0.00d

    #	and, run the randoms as a separate run:
    ssh ku
    mkdir /hive/data/genomes/hg38/bed/stsMap/runRandom
    cd /hive/data/genomes/hg38/bed/stsMap/runRandom
    partitionSequence.pl 40000000 20000 /scratch/data/hg38/hg38.2bit \
	/scratch/data/hg38/chrom.sizes 100 -lstDir tParts \
	| egrep "tParts|random|_alt|chrUn"
    cat tParts/* | sed -e "s/.*2bit://;" > hg38.list
    # verify all contigs are counted here or in the chrom run:
    sed -e 's/:.*//;' hg38.list ../run/hg38.list | sort -u | wc -l
#  455
    wc -l ../../../chrom.sizes
#  455 ../../../chrom.sizes

    ls -1S ../split > sts.list
    cat > template << '_EOF_'
#LOOP
runOne $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat > runOne << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/dev/shm/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \
    /hive/data/genomes/hg38/hg38.2bit:$partSpec \
	../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
/bin/rm -f $tmpFile.psl
'_EOF_'
    # << happy emacs
    chmod +x runOne

    gensub2 hg38.list sts.list template jobList
    #	these jobs run quickly, allow only 100 at a time
    para -maxJob=400 create jobList
# 40514 jobs in batch
    para try ... check ... push ... etc
# Completed: 40514 of 40514 jobs
# CPU time in finished jobs:       6578s     109.64m     1.83h    0.08d  0.000 y
# IO & Wait Time:                103179s    1719.65m    28.66h    1.19d  0.003 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              11s       0.18m     0.00h    0.00d
# Submission to last job:           180s       3.00m     0.05h    0.00d

    # Compile sts sequence results
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/stsMap/run
    time pslSort dirs raw.psl temp psl/chr*
    # 8460 files in 90 dirs
    #  Got 8460 files 92 files per mid file
    #	real    4m23.646s

    #  -rw-rw-r--  1 793116044 Aug 19 10:44 raw.psl

    rmdir temp

    cd /hive/data/genomes/hg38/bed/stsMap/runRandom
    time pslSort dirs raw.psl temp psl/chr*
    #  40514 files in 431 dirs
    #  Got 40514 files 201 files per mid file

    #	real    9m41.405s
    #   -rw-rw-r--   1 54980918 Aug 19 10:55 raw.psl

    rmdir temp
    cd /hive/data/genomes/hg38/bed/stsMap
    time cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
	| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
	stsMarkers.psl /dev/null
    #	Processed 7563037 alignments
    #   real    0m22.459s

    #   -rw-rw-r--   1 12403947 Aug 19 11:01 stsMarkers.psl

    time $HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
    # real    0m2.917s
    # creates stsMarkers.psl.initial

    # -rw-rw-r--   1  4652174 Aug 19 11:03 stsMarkers.psl.initial

    wc -l stsMarkers.psl.initial
    #	104212  stsMarkers.psl.initial
    #	this command needs a chrom_names file and individual AGP files:
    mkdir /hive/data/genomes/hg38/bed/stsMap/agp
    cd /hive/data/genomes/hg38/bed/stsMap/agp
    cut -f1 ../../../chrom.sizes | sed -e "s/chr//" > chrom_names
    splitFileByColumn -chromDirs ../../../hg38.agp .

    cd /hive/data/genomes/hg38/bed/stsMap
    time $HOME/kent/src/hg/stsMarkers/findAccession.pl \
       -agp stsMarkers.psl.initial /hive/data/genomes/hg38/bed/stsMap/agp
    # real    0m8.260s

    # count should be the same as was in stsMarkers.psl.initial
     wc -l stsMarkers.psl.initial.acc stsMarkers.psl.initial
    #   104212 stsMarkers.psl.initial.acc
    #   104212 stsMarkers.psl.initial

    sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final

    # determine found markers (4th field in file)
    cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
    wc -l stsMarkers.found
    #	96531 stsMarkers.found
    #	out of 100520 total sequences from:
    wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
    # 100520 /hive/data/outside/ncbi/sts.2009-04/all.STS.id

    #	There are lots of duplicates:
    wc -l stsMarkers.final
    #	104212 stsMarkers.final
    #	And a lot of them are just completely haywire:
    awk '$3-$2 < 1001' stsMarkers.final | wc -l
    #	101073
    #	filter out markers that are too long
    awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered

    #  alignment of primers, this business in sts.2009-04 was done before,
    #  no need to repeat this
    ssh ku
#    cd /hive/data/outside/ncbi/sts.2009-04
#    awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
#	    all.primers > all.primers.ispcr
#    mkdir primerAlign
#    cd primerAlign
#    mkdir split
#    cd split
#    split -l 5000 ../../all.primers.ispcr primer_
#    ls > ../primer.list

#    cd ..
    #	we need a 10.ooc file for this business
#    time blat /scratch/data/hg38/hg38.2bit \
#	/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
# Wrote 146902 overused 10-mers to 10.ooc
# real    19m16.758s

    # re-use the split primers from the hg19 procedures:
    mkdir -p /hive/data/genomes/hg38/bed/stsMap/primers/split
    cd /hive/data/genomes/hg38/bed/stsMap/primers/split
    ln -s /hive/data/outside/ncbi/sts.2009-04/primerAlign/split/* .
    ls > ../primer.list

    #	we need a hg38.10.ooc file for this business
    cd /hive/data/genomes/hg38/bed/stsMap/primers
    time blat /hive/data/genomes/hg38/hg38.2bit \
	/dev/null /dev/null -tileSize=10 -makeOoc=hg38.10.ooc -repMatch=1024
    #  Wrote 160397 overused 10-mers to hg38.10.ooc
    #  real    0m56.075s

    # separate runs for whole genome vs. randoms
    mkdir /hive/data/genomes/hg38/bed/stsMap/primers/run
    cd /hive/data/genomes/hg38/bed/stsMap/primers/run
    partitionSequence.pl 40000000 20000 /hive/data/genomes/hg38/hg38.2bit \
	/hive/data/genomes/hg38/chrom.sizes 100 -lstDir tParts \
	| egrep -v "tParts|random|_alt|chrUn" \
	| sed -e "s/.*2bit://;" > hg38.list
    # verify only the normal chrom names are in the list:
    sed -e 's/:.*//;' hg38.list | sort -u | xargs echo
# chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2
# chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY

    cat > runOne << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/dev/shm/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /hive/data/genomes/hg38/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
    -ooc=/hive/data/genomes/hg38/bed/stsMap/primers/hg38.10.ooc -stepSize=5 \
	/hive/data/genomes/hg38/hg38.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
/bin/rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat > template << '_EOF_'
#LOOP
runOne $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 hg38.list ../primer.list template jobList
    para create jobList
# 5760 jobs written to /hive/data/genomes/hg38/bed/stsMap/primers/run/batch
    para try ... check ... push ... etc
# Completed: 5760 of 5760 jobs
# CPU time in finished jobs:     117786s    1963.11m    32.72h    1.36d  0.004 y
# IO & Wait Time:                 15087s     251.44m     4.19h    0.17d  0.000 y
# Average job time:                  23s       0.38m     0.01h    0.00d
# Longest finished job:            3081s      51.35m     0.86h    0.04d
# Submission to last job:          3192s      53.20m     0.89h    0.04d

    #	sort and filter the results
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/stsMap/primers/run/psl
    time pslSort dirs raw.psl temp chr*
    #   5760 files in 90 dirs
    #   Got 5760 files 76 files per mid file
    # real    real    1m34.065s
    #   -rw-rw-r-- 1 193562562 Aug 19 12:18 raw.psl

    cd ..
    mkdir filter
    time pslQuickFilter -minMatch=26 -maxMismatch=5 \
        -maxTinsert=5000 -verbose psl/ filter/
    # real    0m3.606s
    #   -rw-rw-r-- 1 59390296 Aug 19 12:19 raw.psl

    #	And, for the randoms
    mkdir /hive/data/genomes/hg38/bed/stsMap/primers/runRandom
    cd /hive/data/genomes/hg38/bed/stsMap/primers/runRandom

    partitionSequence.pl 40000000 20000 /hive/data/genomes/hg38/hg38.2bit \
	/hive/data/genomes/hg38/chrom.sizes 100 -lstDir tParts \
	| egrep "tParts|random|_alt|chrUn" \
	| sed -e "s/.*2bit://;" > hg38.list
    cat tParts/* | sed -e "s/.*2bit://;" > hg38.list
    # verify all contigs are counted here or in the chrom run:
    sed -e 's/:.*//;' hg38.list ../run/hg38.list | sort -u | wc -l
#  455
    wc -l ../../../../chrom.sizes
#  455 ../../../../chrom.sizes

    cat > runOne << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/dev/shm/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
    -ooc=/hive/data/genomes/hg38/bed/stsMap/primers/hg38.10.ooc -stepSize=5 \
	/hive/data/genomes/hg38/hg38.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
'_EOF_'
    # << happy emacs
    chmod +x runOne

    #	can not use line+ check here, many of them are empty
    cat > template << '_EOF_'
#LOOP
runOne $(file1) $(root2) {check out exists psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 hg38.list ../primer.list template jobList
    #	they run quickly, limit to 100
    para -maxJob=400 create jobList
    # 27584 jobs in batch
    para try ... check ... push ... etc
# Completed: 27584 of 27584 jobs
# CPU time in finished jobs:       6588s     109.79m     1.83h    0.08d  0.000 y
# IO & Wait Time:                 70470s    1174.51m    19.58h    0.82d  0.002 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              25s       0.42m     0.01h    0.00d
# Submission to last job:           145s       2.42m     0.04h    0.00d

    #	sort and filter the results
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/stsMap/primers/runRandom/psl
    cd psl
    pslSort dirs raw.psl temp chr*
    # 27584 files in 431 dirs
    #   Got 27584 files 166 files per mid file
    rmdir temp
    #   -rw-rw-r-- 1 134646598 Aug 19 12:32 raw.psl
    cd ..
    time pslQuickFilter -minMatch=26 -maxMismatch=5 \
         -maxTinsert=5000 -verbose psl/ filter/
    # real    0m1.878s
    #   -rw-rw-r-- 1 10030723 Aug 19 12:36 raw.psl

    #	putting the two runs together
    cd /hive/data/genomes/hg38/bed/stsMap/primers
    time pslSort dirs primers.psl temp run/psl/raw.psl runRandom/psl/raw.psl
    # real    9m40.210s

    # -rw-rw-r-- 1  328208733 Aug 19 12:37 primers.psl

    rmdir temp
    ln -s /hive/data/outside/ncbi/sts.2009-04/all.primers .
     time pslFilterPrimers -verbose=1 primers.psl all.primers primers.filter.psl
# Reading all primers file: 'all.primers'
# Reading isPCR file: 'primers.psl' processing output to: 'primers.filter.psl'
# Writing primers not found to file: 'primers.filter.psl.notfound.primers'
# real    0m5.332s

# -rw-rw-r-- 1   30233721 Aug 20 09:57 primers.filter.psl
# -rw-rw-r-- 1    6877339 Aug 20 09:57 primers.filter.psl.notfound.primers
    wc -l primers.filter.psl primers.filter.psl.notfound.primers
    # 262188 primers.filter.psl
    # 109663 primers.filter.psl.notfound.primers

    #	see if ePCR can find some of these notfound
    ssh ku
    mkdir /hive/data/genomes/hg38/bed/stsMap/primers/epcr
    cd /hive/data/genomes/hg38/bed/stsMap/primers/epcr
#     cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr

    mkdir split
    cd split
    split -l 5000 ../../primers.filter.psl.notfound.primers  primers_
    cd ..
    ls -1S split > primers.lst
    ~/kent/src/hg/utils/automation/partitionSequence.pl 40000000 20000 \
        /scratch/data/hg38/hg38.2bit \
	/scratch/data/hg38/chrom.sizes 100 -lstDir tParts \
	| grep -v tParts | sed -e "s/.*2bit://;" > hg38.list
    cat tParts/* | sed -e "s/.*2bit://;" >> hg38.list

    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set primer = split/$2
set result = $3
set tmpFile = "/dev/shm/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg38/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p epcr/$partSpec
/bin/rm -f $tmpFile.psl
twoBitToFa /scratch/data/hg38/hg38.2bit:$partSpec $tmpFile.fa
/cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
/bin/rm -f $result
/bin/mv $tmpFile.epcr $result
rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
'_EOF_'
    # << happy emacs
    chmod +x runOne.csh

    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 hg38.list primers.lst template jobList
    para create jobList
	# 3160 jobs
    para try ... check ... push ... etc ...
# Completed: 11462 of 11462 jobs
# CPU time in finished jobs:     122087s    2034.78m    33.91h    1.41d  0.004 y
# IO & Wait Time:                 32148s     535.80m     8.93h    0.37d  0.001 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest finished job:             127s       2.12m     0.04h    0.00d
# Submission to last job:           958s      15.97m     0.27h    0.01d

    find ./epcr -type f | xargs cat > all.epcr
    wc -l all.epcr
    #	848232 all.epcr
    awk '{print $1}' all.epcr | sort -u > hg38.partSpec.txt
    $HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg38 hg38.partSpec.txt \
	> all.epcr.lift
    cat all.epcr | sed -e "s/\.\./ /; s/  */\t/g" \
	| liftUp -type=.bed stdout all.epcr.lift error stdin \
	| awk '
{
printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
}
' > all.epcr.lifted

    time ~/kent/src/hg/pslFilterPrimers/pslFilterPrimers \
        -epcr=all.epcr.lifted \
	-verbose=2 ../primers.psl ../all.primers epcr.primers.psl
# Reading all primers file: '../all.primers'
# Reading epcr file: 'all.epcr.lifted'
# Reading isPCR file: '../primers.psl' processing output to: 'epcr.primers.psl'
# Writing epcr.not.found file
# Writing primers not found to file: 'epcr.primers.psl.notfound.primers'

# real    25m27.329s

# -rw-rw-r--   1  3106400 Aug 20 14:40 epcr.not.found
# -rw-rw-r--   1 30233721 Aug 20 14:40 epcr.primers.psl
# -rw-rw-r--   1  1966514 Aug 20 14:40 epcr.primers.psl.notfound.primers
# -rw-rw-r--   1  1616885 May  5 17:28 epcr.primers.psl.notfound.primers

    time ./epcrToHgPsl.pl epcr.not.found ../../all.primers \
    time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
	../all.primers /hive/data/genomes/hg38
# Reading chrom sizes /hive/data/genomes/hg38/chrom.sizes
# Reading primer info from ../all.primers
# Creating psl file epcr.not.found.psl and epcr.not.found.nomatch
# 500 of 80740 complete == % 0.62
# 1000 of 80740 complete == % 1.24
# 1500 of 80740 complete == % 1.86
# ...
# 79500 of 80740 complete == % 98.46
# 80000 of 80740 complete == % 99.08
# 80500 of 80740 complete == % 99.70
# real    13m54.272s

    #	real    69m38.444s
    #   -rw-rw-r--   1        0 Aug 20 16:38 epcr.not.found.nomatch
    #   -rw-rw-r--   1  9293194 Aug 20 16:52 epcr.not.found.psl

    #	combining everything together now
    cd /hive/data/genomes/hg38/bed/stsMap/primers

    sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
                | sort -k15,15 -k17,17n > primers.final.psl
    wc -l primers.final.psl
    #	342834 primers.final.psl

    time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
        all.primers primers.final.psl > primers.final.fix.psl
    #	real    0m13.182s
    wc -l primers.final.fix.psl
    #	342834 primers.final.fix.psl

    # Extract relevant info, make alignments unique, and create final file to
    #	be merged with full sequence alignments
    $HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
    #	real    0m15.303s
    #	-rw-rw-r-- 1   17594603 Aug 21 12:31 primers.final.fix.psl.initial
    wc -l primers.final.fix.psl.initial
    #	338460 primers.final.fix.psl.initial

    $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
	primers.final.fix.psl.initial /hive/data/genomes/hg38/bed/stsMap/agp
    # -rw-rw-r-- 1   21363234 Aug 21 12:36 primers.final.fix.psl.initial.acc

    wc -l primers.final.fix.psl.initial.acc
    #	338460 primers.final.fix.psl.initial.acc

    ln -s /hive/data/outside/ncbi/sts.11/stsInfo2.bed .
    time $HOME/kent/src/hg/stsMarkers/getStsId stsInfo2.bed \
	primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
    # real    0m9.363s

    wc -l primers.final
    # 338460 primers.final

    #	There doesn't appear to be any use for this primers.ids list
    #	except for curiosity.  Check the head and tail of this list to
    #	verify no garbage is in here.  There should just be numbers.
    awk '{print $4}' primers.final | sort -n | uniq > primers.ids
    wc -l primers.ids
    #	285101 primers.ids

    # Merge primer and sequence files to create final bed file
    # Merge (combineSeqPrimerPos) takes about an hour to run
    cd /hive/data/genomes/hg38/bed/stsMap
    time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
	primers/primers.final
    #	real    0m10.914s
    #	-rw-rw-r--   1 21482704 Aug 22 22:50 stsMarkers_pos.rdb

    wc -l stsMarkers_pos.rdb
    #	344391 stsMarkers_pos.rdb

    time /cluster/bin/scripts/createSTSbed primers/stsInfo2.bed \
	stsMarkers_pos.rdb > stsMap.bed
    #	real    0m10.287s
    #	-rw-rw-r--   1 38176467 Aug 22 22:51 stsMap.bed
    wc -l stsMap.bed
    #	304339 stsMap.bed

    # Set up sequence files
    ssh hgwdev
    mkdir /gbdb/hg38/sts.11/
    ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
	/gbdb/hg38/sts.11/all.STS.fa
    ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
        /gbdb/hg38/sts.11/all.primers.fa

    # Load all files
    cd /hive/data/genomes/hg38/bed/stsMap
    hgLoadSeq hg38 /gbdb/hg38/sts.11/all.STS.fa /gbdb/hg38/sts.11/all.primers.fa
    # Creating seq.tab file
    # Adding /gbdb/hg38/sts.11/all.STS.fa
    # 100520 sequences
    # Adding /gbdb/hg38/sts.11/all.primers.fa
    # 317592 sequences
    # Updating seq table

    hgsql hg38 < $HOME/kent/src/hg/lib/stsInfo2.sql
    hgsql hg38 < $HOME/kent/src/hg/lib/stsAlias.sql
    #	these files already exist from previous operations, reusing them here:
    ln -s  /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
    hgsql hg38 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
    hgsql hg38 -e 'load data local infile "stsAlias.bed" into table stsAlias'
    #	a couple minutes for each load above
    #	filter the stsMap.bed to eliminate items longer than 5,000 bases,
    #	takes out about 850:
    awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
	> stsMap.filtered.5000.bed

    hgLoadBed -notItemRgb -noBin -tab \
	-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg38 stsMap \
	    stsMap.filtered.5000.bed
    #	Read 306855 elements of size 28 from stsMap.filtered.5000.bed

    # four rows had illegal tBaseInsert values in column 8, filter those out:
# Warning 1264 Out of range value for column 'tBaseInsert' at row 15925
# Warning 1264 Out of range value for column 'tBaseInsert' at row 21564
# Warning 1264 Out of range value for column 'tBaseInsert' at row 70740
# Warning 1264 Out of range value for column 'tBaseInsert' at row 112040
    awk -F'\t' '$8 > -1' primers/primers.final.fix.psl \
      | hgLoadPsl -nobin -table=all_sts_primer hg38 stdin

    hgLoadPsl -nobin -table=all_sts_seq hg38 stsMarkers.psl

##############################################################################
