# for emacs: -*- mode: sh; -*-

##############################################################################
# hg38 patch 11 build
##############################################################################
##  download sequence, prepare files (DONE - 2017-08-08 - Hiram)
##############################################################################

NOTE: This doesn't work if you try to use the 'refseq' assembly.
      We are trying to compare part names with UCSC hg38 and the 'genbank'
      assembly has the same names.

mkdir /hive/data/genomes/hg38/bed/hg38Patch11
cd /hive/data/genomes/hg38/bed/hg38Patch11
mkdir genbank
cd genbank
time rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.26_GRCh38.p11/ ./
# sent 22792 bytes  received 4107420691 bytes  7699050.58 bytes/sec
# total size is 4106840243  speedup is 1.00
# real    8m52.740s

# this is an entire release of everything:

faSize GCA_000001405.26_GRCh38.p11_genomic.fna.gz

# 3253848404 bases (161368694 N's 3092479710 real 1917945460 upper
#	1174534250 lower) in 578 sequences in 1 files
# Total size: mean 5629495.5 sd 28120492.5 min 970 (KI270394.1)
#	max 248956422 (CM000663.2) median 166136
# %36.10 masked total, %37.98 masked real

# so the question is, what is new here compared to what we have in hg38

cd /hive/data/genomes/hg38/bed/hg38Patch11
time faCount genbank/GCA_000001405.26_GRCh38.p11_genomic.fna.gz \
   > faCount.GRCH38.p11.txt
#  real    1m13.179s

~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl ../../chrom.sizes \
faCount.GRCH38.p11.txt genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt \
   | grep new | sed -e 's/^/# /'

# there are 123 new sequences:  (verify == 578 (p11) - 455 (hg38 chrom.sizes))

# chr1_KQ031383v1_fix   467143  KQ031383.1      new
# chr1_KQ983255v1_alt   278659  KQ983255.1      new
# chr1_KN538361v1_fix   305542  KN538361.1      new
# chr1_KQ458383v1_alt   349938  KQ458383.1      new
# chr1_KN196473v1_fix   166200  KN196473.1      new
# chr1_KZ208904v1_alt   166136  KZ208904.1      new
# chr1_KN196472v1_fix   186494  KN196472.1      new
# chr1_KQ458382v1_alt   141019  KQ458382.1      new
# chr1_KV880763v1_alt   551020  KV880763.1      new
# chr1_KZ208905v1_alt   140355  KZ208905.1      new
# chr1_KN196474v1_fix   122022  KN196474.1      new
# chr1_KN538360v1_fix   460100  KN538360.1      new
# chr1_KZ208906v1_fix   330031  KZ208906.1      new
# chr1_KQ458384v1_alt   212205  KQ458384.1      new
# chr2_KQ031384v1_fix   481245  KQ031384.1      new
# chr2_KQ983256v1_alt   535088  KQ983256.1      new
# chr2_KZ208907v1_alt   181658  KZ208907.1      new
# chr2_KZ208908v1_alt   140361  KZ208908.1      new
# chr2_KN538362v1_fix   208149  KN538362.1      new
# chr2_KN538363v1_fix   365499  KN538363.1      new
# chr3_KN196475v1_fix   451168  KN196475.1      new
# chr3_KN538364v1_fix   415308  KN538364.1      new
# chr3_KQ031385v1_fix   373699  KQ031385.1      new
# chr3_KV766192v1_fix   411654  KV766192.1      new
# chr3_KZ208909v1_alt   175849  KZ208909.1      new
# chr3_KN196476v1_fix   305979  KN196476.1      new
# chr3_KQ031386v1_fix   165718  KQ031386.1      new
# chr4_KQ090013v1_alt   90922   KQ090013.1      new
# chr4_KQ090014v1_alt   163749  KQ090014.1      new
# chr4_KQ090015v1_alt   236512  KQ090015.1      new
# chr4_KV766193v1_alt   420675  KV766193.1      new
# chr4_KQ983257v1_fix   230434  KQ983257.1      new
# chr4_KQ983258v1_alt   205407  KQ983258.1      new
# chr5_KN196477v1_alt   139087  KN196477.1      new
# chr5_KV575243v1_alt   362221  KV575243.1      new
# chr5_KZ208910v1_alt   135987  KZ208910.1      new
# chr5_KV575244v1_fix   673059  KV575244.1      new
# chr6_KZ208911v1_fix   242796  KZ208911.1      new
# chr6_KQ090017v1_alt   82315   KQ090017.1      new
# chr6_KN196478v1_fix   268330  KN196478.1      new
# chr6_KQ031387v1_fix   320750  KQ031387.1      new
# chr6_KQ090016v1_fix   245716  KQ090016.1      new
# chr6_KV766194v1_fix   139427  KV766194.1      new
# chr7_KV880764v1_fix   142129  KV880764.1      new
# chr7_KV880765v1_fix   468267  KV880765.1      new
# chr7_KZ208912v1_fix   589656  KZ208912.1      new
# chr7_KZ208913v1_alt   680662  KZ208913.1      new
# chr7_KQ031388v1_fix   179932  KQ031388.1      new
# chr8_KV880766v1_fix   156998  KV880766.1      new
# chr8_KV880767v1_fix   265876  KV880767.1      new
# chr8_KZ208914v1_fix   165120  KZ208914.1      new
# chr8_KZ208915v1_fix   6367528 KZ208915.1      new
# chr9_KQ090018v1_alt   163882  KQ090018.1      new
# chr9_KQ090019v1_alt   134099  KQ090019.1      new
# chr9_KN196479v1_fix   330164  KN196479.1      new
# chr10_KN538367v1_fix  420164  KN538367.1      new
# chr10_KQ090020v1_alt  185507  KQ090020.1      new
# chr10_KN196480v1_fix  277797  KN196480.1      new
# chr10_KN538365v1_fix  14347   KN538365.1      new
# chr10_KN538366v1_fix  85284   KN538366.1      new
# chr10_KQ090021v1_fix  264545  KQ090021.1      new
# chr11_KQ759759v1_fix  196940  KQ759759.1      new
# chr11_KN538368v1_alt  203552  KN538368.1      new
# chr11_KN196481v1_fix  108875  KN196481.1      new
# chr11_KQ090022v1_fix  181958  KQ090022.1      new
# chr11_KV766195v1_fix  140877  KV766195.1      new
# chr12_KQ090023v1_alt  109323  KQ090023.1      new
# chr12_KN196482v1_fix  211377  KN196482.1      new
# chr12_KN538369v1_fix  541038  KN538369.1      new
# chr12_KZ208916v1_fix  1046838 KZ208916.1      new
# chr12_KZ208918v1_alt  174808  KZ208918.1      new
# chr12_KN538370v1_fix  86533   KN538370.1      new
# chr12_KQ759760v1_fix  315610  KQ759760.1      new
# chr12_KZ208917v1_fix  64689   KZ208917.1      new
# chr13_KN538372v1_fix  356766  KN538372.1      new
# chr13_KQ090024v1_alt  168146  KQ090024.1      new
# chr13_KN196483v1_fix  35455   KN196483.1      new
# chr13_KN538373v1_fix  148762  KN538373.1      new
# chr13_KQ090025v1_alt  123480  KQ090025.1      new
# chr13_KN538371v1_fix  206320  KN538371.1      new
# chr14_KZ208920v1_fix  690932  KZ208920.1      new
# chr14_KZ208919v1_alt  171798  KZ208919.1      new
# chr15_KN538374v1_fix  4998962 KN538374.1      new
# chr15_KQ031389v1_alt  2365364 KQ031389.1      new
# chr16_KQ090026v1_alt  59016   KQ090026.1      new
# chr16_KV880768v1_fix  1927115 KV880768.1      new
# chr16_KQ031390v1_alt  169136  KQ031390.1      new
# chr16_KQ090027v1_alt  267463  KQ090027.1      new
# chr16_KZ208921v1_alt  78609   KZ208921.1      new
# chr17_KV575245v1_fix  154723  KV575245.1      new
# chr17_KV766196v1_fix  281919  KV766196.1      new
# chr17_KV766197v1_alt  246895  KV766197.1      new
# chr17_KV766198v1_alt  276292  KV766198.1      new
# chr18_KQ458385v1_alt  205101  KQ458385.1      new
# chr18_KQ090028v1_fix  407387  KQ090028.1      new
# chr18_KZ208922v1_fix  93070   KZ208922.1      new
# chr19_KN196484v1_fix  370917  KN196484.1      new
# chr19_KQ458386v1_fix  405389  KQ458386.1      new
# chr19_KV575246v1_alt  163926  KV575246.1      new
# chr19_KV575247v1_alt  170206  KV575247.1      new
# chr19_KV575248v1_alt  168131  KV575248.1      new
# chr19_KV575249v1_alt  293522  KV575249.1      new
# chr19_KV575250v1_alt  241058  KV575250.1      new
# chr19_KV575251v1_alt  159285  KV575251.1      new
# chr19_KV575252v1_alt  178197  KV575252.1      new
# chr19_KV575253v1_alt  166713  KV575253.1      new
# chr19_KV575254v1_alt  99845   KV575254.1      new
# chr19_KV575255v1_alt  161095  KV575255.1      new
# chr19_KV575256v1_alt  223118  KV575256.1      new
# chr19_KV575257v1_alt  100553  KV575257.1      new
# chr19_KV575258v1_alt  156965  KV575258.1      new
# chr19_KV575259v1_alt  171263  KV575259.1      new
# chr19_KV575260v1_alt  145691  KV575260.1      new
# chr22_KN196485v1_alt  156562  KN196485.1      new
# chr22_KN196486v1_alt  153027  KN196486.1      new
# chr22_KQ458387v1_alt  155930  KQ458387.1      new
# chr22_KQ458388v1_alt  174749  KQ458388.1      new
# chr22_KQ759761v1_alt  145162  KQ759761.1      new
# chr22_KQ759762v1_fix  101037  KQ759762.1      new
# chrX_KV766199v1_alt   188004  KV766199.1      new
# chrY_KN196487v1_fix   101150  KN196487.1      new
# chrY_KZ208923v1_fix   48370   KZ208923.1      new
# chrY_KZ208924v1_fix   209722  KZ208924.1      new

# how much sequence:
~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCH38.p11.txt genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt \
     | grep new | awk '{sum += $2; printf "%d\t%s\n", sum, $0}' | tail
# 44562299        chrY_KZ208924v1_fix     209722  KZ208924.1      new
# ^^^^^^^^ total new sequence
    # verify, vs. p11: 3253848404  hg38: 3209286105
    # 3253848404 - 3209286105 = 44562299

~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCH38.p11.txt genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt \
     | grep new > new.sequences.list

cut -f3 new.sequences.list > extract.new.list
awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed

~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCH38.p11.txt genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt \
     | grep -v new > existing.sequences.list

cut -f3 existing.sequences.list > extract.exist.list
awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed

faSomeRecords genbank/GCA_000001405.26_GRCh38.p11_genomic.fna.gz \
    extract.new.list stdout | sed -e 's/ .*//;' | \
      sed -f genbankToUCSC.sed | gzip -c > hg38Patch11.fa.gz

faSomeRecords genbank/GCA_000001405.26_GRCh38.p11_genomic.fna.gz \
  extract.exist.list stdout | sed -e 's/ .*//;' | gzip -c > existing.fa.gz

# verify same amount of sequence here as hg38:
faSize existing.fa.gz
# 3209286105 bases (159970322 N's 3049315783 real 1890811945 upper 1158503838 lower) in 455 sequences in 1 files
# Total size: mean 7053376.1 sd 31548372.6 min 970 (KI270394.1) max 248956422 (CM000663.2) median 161218
# %36.10 masked total, %37.99 masked real


# hg38 has different masking
head -1 ../../faSize.hg38.2bit.txt 
# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper 1588630985 lower) in 455 sequences in 1 files

# verify correct amount of patch11 sequence here:
faSize hg38Patch11.fa.gz
# 44562299 bases (1398372 N's 43163927 real 27133515 upper 16030412 lower)
#	in 123 sequences in 1 files
# Total size: mean 362295.1 sd 750382.1 min 14347 (chr10_KN538365v1_fix)
#	max 6367528 (chr8_KZ208915v1_fix) median 186494
# %35.97 masked total, %37.14 masked real

# this is the same total obtained before:
# 44562299        chrY_KZ208924v1_fix     209722  KZ208924.1      new

# and both together should equal the original full patch11 sequence
time faSize existing.fa.gz hg38Patch11.fa.gz
# real    1m12.301s
# 3253848404 bases (161368694 N's 3092479710 real 1917945460 upper
#	1174534250 lower) in 578 sequences in 2 files
# Total size: mean 5629495.5 sd 28120492.5 min 970 (KI270394.1)
#	max 248956422 (CM000663.2) median 166136
# %36.10 masked total, %37.98 masked real

# original from above:

# 3253848404 bases (161368694 N's 3092479710 real 1917945460 upper
#	1174534250 lower) in 578 sequences in 1 files
# Total size: mean 5629495.5 sd 28120492.5 min 970 (KI270394.1)
#	max 248956422 (CM000663.2) median 166136
# %36.10 masked total, %37.98 masked real

# construct locations file:
~/kent/src/hg/makeDb/doc/hg38/regionScan.pl extract.new.list \
   genbank/GCA_000001405.26_GRCh38.p11_assembly_regions.txt \
     > patchLocations.bed

# verify correct number of locations, measured 96 before:
wc -l patchLocations.bed 
#   123 patchLocations.bed

#  separate haplotypes from fix patches for two tracks:
grep -v fix patchLocations.bed | sed -e 's/_alt//;' \
   | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v/./;' > hg38Patch11Haplotypes.bed

grep fix patchLocations.bed | sed -e 's/_fix//;' \
   | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v\([0-9]\)$/.\1/;' \
     > hg38Patch11Patches.bed

# verify nothing lost, should be 123:
wc -l hg38*.bed
#   59 hg38Patch11Haplotypes.bed
#   64 hg38Patch11Patches.bed
#  123 total

hgLoadBed -type=bed4 hg38 hg38Patch11Haplotypes hg38Patch11Haplotypes.bed 
# Read 59 elements of size 4 from hg38Patch11Haplotypes.bed

hgLoadBed -type=bed4 hg38 hg38Patch11Patches hg38Patch11Patches.bed 
# Read 64 elements of size 4 from hg38Patch11Patches.bed

# construct 2bit file:
faToTwoBit hg38Patch11.fa.gz hg38Patch11.unmasked.2bit
twoBitInfo hg38Patch11.unmasked.2bit stdout | sort -k2nr > hg38Patch11.chrom.sizes
# take a look at that to verify it looks OK:
cat hg38Patch11.chrom.sizes | sed -e 's/^/# /;'
# chr8_KZ208915v1_fix   6367528
# chr15_KN538374v1_fix  4998962
# chr15_KQ031389v1_alt  2365364
# chr16_KV880768v1_fix  1927115
# chr12_KZ208916v1_fix  1046838
# chr14_KZ208920v1_fix  690932
# chr7_KZ208913v1_alt   680662
# chr5_KV575244v1_fix   673059
# chr7_KZ208912v1_fix   589656
# chr1_KV880763v1_alt   551020
# chr12_KN538369v1_fix  541038
# chr2_KQ983256v1_alt   535088
...
# chr19_KV575257v1_alt  100553
# chr19_KV575254v1_alt  99845
# chr18_KZ208922v1_fix  93070
# chr4_KQ090013v1_alt   90922
# chr12_KN538370v1_fix  86533
# chr10_KN538366v1_fix  85284
# chr6_KQ090017v1_alt   82315
# chr16_KZ208921v1_alt  78609
# chr12_KZ208917v1_fix  64689
# chr16_KQ090026v1_alt  59016
# chrY_KZ208923v1_fix   48370
# chr13_KN196483v1_fix  35455
# chr10_KN538365v1_fix  14347

zcat genbank/GCA_000001405.26_GRCh38.p11_assembly_structure/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \
   | sed -f genbankToUCSC.sed > hg38Patch11.agp

checkAgpAndFa hg38Patch11.agp hg38Patch11.unmasked.2bit | tail -1

# All AGP and FASTA entries agree - both files are valid

#############################################################################
# build hg38Patch11 database (DONE - 2017-08-09 - Hiram)
    # need this database for netClass operations during the chain/net
    # construction

    mkdir /hive/data/genomes/hg38Patch11
    cd /hive/data/genomes/hg38Patch11
    mkdir /gbdb/hg38Patch11
    ln -s /hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit ./
    ln -s /hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.agp ./

    twoBitInfo hg38Patch11.unmasked.2bit stdout | sort -k2nr > chrom.sizes
    mkdir -p bed/chromInfo
    awk '{printf "%s\t%d\t/gbdb/hg38Patch11/hg38Patch11.2bit\n", $1, $2}' \
          chrom.sizes > bed/chromInfo/chromInfo.tab

    hgsql -e 'create database hg38Patch11;' hg38

    hgsql hg38Patch11 < $HOME/kent/src/hg/lib/grp.sql

    hgLoadSqlTab hg38Patch11 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql \
             bed/chromInfo/chromInfo.tab
    hgGoldGapGl -noGl hg38Patch11 hg38Patch11.agp
    featureBits -or -countGaps hg38Patch11 gold gap
    # 44562299 bases of 44562299 (100.000%) in intersection

    hgsql hgcentraltest -e 'INSERT INTO dbDb (name, description, nibPath,
organism, defaultPos, active, orderKey, genome, scientificName, htmlPath,
hgNearOk, hgPbOk, sourceName, taxId)
VALUES ("hg38Patch11", "Jun. 2017", "/gbdb/hg38Patch11", "GRCh38.p11",
"chr15_KN538374v1_fix:2000000-2200000", 1, 7749, "GRCh38.p11", "Homo sapiens",
"/gbdb/hg38Patch11/html/description.html", 0, 0,
"GRCh38 patch 11 Genome Reference Consortium Human Reference 38", 9606);
INSERT INTO defaultDb (genome, name) VALUES ("GRCh38.p11", "hg38Patch11");
INSERT INTO genomeClade (genome, clade, priority)
VALUES ("GRCh38.p11", "haplotypes", 134);'

    mkdir html
    # copy description.html from hg38Patch9/html/description.html
    # edit to update for Patch11
    cp ../../hg38Patch9/html/description.html .

    mkdir -p /hive/data/genomes/hg38Patch11/bed/gc5Base
    cd /hive/data/genomes/hg38Patch11/bed/gc5Base
  time hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg38Patch11 \
         ../../hg38Patch11.unmasked.2bit | wigEncode stdin gc5Base.{wig,wib}
    #  real    0m5.699s
    # Converted stdin, upper limit 100.00, lower limit 0.00

    mkdir /gbdb/hg38Patch11/wib
    ln -s `pwd`/gc5Base.wib /gbdb/hg38Patch11/wib

    hgLoadWiggle -pathPrefix=/gbdb/hg38Patch11/wib hg38Patch11 gc5Base gc5Base.wig

    mkdir /hive/data/genomes/hg38Patch11/bed/repeatMasker
    cd /hive/data/genomes/hg38Patch11/bed/repeatMasker
    time (doRepeatMasker.pl -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38Patch11) \
          > do.log 2>&1
    # real    53m35.937s

    cat faSize.rmsk.txt
# 44562299 bases (1398372 N's 43163927 real 20846957 upper 22316970 lower)
#	in 123 sequences in 1 files
# Total size: mean 362295.1 sd 750382.1 min 14347 (chr10_KN538365v1_fix)
#	max 6367528 (chr8_KZ208915v1_fix) median 186494
# %50.08 masked total, %51.70 masked real

    mkdir /hive/data/genomes/hg38Patch11/bed/simpleRepeat
    cd /hive/data/genomes/hg38Patch11/bed/simpleRepeat
    time (doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \
        -trf409=6 -smallClusterHub=ku -buildDir=`pwd` hg38Patch11) > do.log 2>&1
    # real    5m36.925s

    cat fb.simpleRepeat
    # 2049146 bases of 43164255 (4.747%) in intersection

    # the simpleRepeat procedure fails in the cleanup step since there
    # is no TrfPart directory

    mkdir /hive/data/genomes/hg38Patch11/bed/windowMasker
    cd /hive/data/genomes/hg38Patch11/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev hg38Patch11) > do.log 2>&1 &
    # real    2m7.148s

    cat faSize.hg38Patch11.cleanWMSdust.txt

# 44562299 bases (1398372 N's 43163927 real 31289187 upper 11874740 lower)
#	in 123 sequences in 1 files
# Total size: mean 362295.1 sd 750382.1 min 14347 (chr10_KN538365v1_fix)
#	max 6367528 (chr8_KZ208915v1_fix) median 186494
# %26.65 masked total, %27.51 masked real

    cat fb.hg38Patch11.rmsk.windowmaskerSdust.txt
# 9423450 bases of 44562299 (21.147%) in intersection

    cd /hive/data/genomes/hg38Patch11
    twoBitMask hg38Patch11.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed hg38Patch11.2bit
    twoBitToFa hg38Patch11.2bit stdout | faSize stdin

# 44562299 bases (1398372 N's 43163927 real 20794645 upper 22369282 lower)
#	in 123 sequences in 1 files
# Total size: mean 362295.1 sd 750382.1 min 14347 (chr10_KN538365v1_fix)
#	max 6367528 (chr8_KZ208915v1_fix) median 186494
# %50.20 masked total, %51.82 masked real

    # same size as above in original source:

# 44562299 bases (1398372 N's 43163927 real 27133515 upper 16030412 lower)
#	in 123 sequences in 1 files

    ln -s `pwd`/hg38Patch11.2bit /gbdb/hg38Patch11

#########################################################################
# run up idKeys files for ncbiRefSeq (DONE - 2017-08-09 - Hiram)
    mkdir /hive/data/genomes/hg38Patch11/bed/idKeys
    cd /hive/data/genomes/hg38Patch11/bed/idKeys

    time (doIdKeys.pl \
      -twoBit=/hive/data/genomes/hg38Patch11/hg38Patch11.unmasked.2bit \
        -buildDir=`pwd`  hg38Patch11) > do.log 2>&1
    # real    0m18.300s

    cat hg38Patch11.keySignature.txt
    #   74497a30967b27ea68a7fad40cc01de5

#########################################################################
# ncbiRefSeq Genes (TBD - 2016-05-19 - Hiram)

    mkdir /hive/data/genomes/hg38Patch11/bed/ncbiRefSeq
    cd /hive/data/genomes/hg38Patch11/bed/ncbiRefSeq
    # running step wise as this script is still under development
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      genbank vertebrate_mammalian Homo_sapiens \
      GCA_000001405.33_GRCh38.p11 hg38Patch11) > download.log 2>&1
    # real    11m43.551s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      genbank vertebrate_mammalian Homo_sapiens \
      GCA_000001405.33_GRCh38.p11 hg38Patch11) > process.log 2>&1
    # real    12m40.655s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      genbank vertebrate_mammalian Homo_sapiens \
      GCA_000001405.33_GRCh38.p11 hg38Patch11) > load.log 2>&1
    # real    0m13.277s

    cat fb.ncbiRefSeq.hg38Patch11.txt
    #  1727668 bases of 21862561 (7.902%) in intersection

#########################################################################
# ucscToINSDC table/track (TBD - 2016-05-19 - Hiram)
    # the sequence here is working for a 'genbank' assembly with a chrM
    # situation may be specific depending upon what is available in the assembly

    mkdir /hive/data/genomes/hg38Patch11/bed/ucscToINSDC
    cd /hive/data/genomes/hg38Patch11/bed/ucscToINSDC

    # special script to get the names directly out of the assembly report:
    ./processReport.pl  | sort > ucscToINSDC.txt

    awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt
    # there is no name for chrM/NC_001323.1 sequence, there is no such
    #  sequence with an INSDC name
    grep -v "^#" /hive/data/genomes/hg38/bed/hg38Patch11/genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt | cut -f5,7 \
      | sed -e 's/na\b/notAvailable/;' | awk '{printf "%s\t%s\n", $1, $2}' \
         | sort > insdc.genbank.txt
    # the sed \b means to match word

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join insdc.genbank.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \
           > ucscToINSDC.bed

    # should be same line counts throughout (except for insdc.genbank.txt):
    wc -l *
    # 196 insdc.genbank.txt
    # 130 insdcToUcsc.txt
    # 130 name.coordinate.tab
    # 130 ucscToINSDC.bed
    # 130 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab hg38Patch11 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords hg38Patch11
    # should cover %100 entirely:
    featureBits -countGaps hg38Patch11 ucscToINSDC
    # 23260605 bases of 23260605 (100.000%) in intersection

    join -1 2 <(sort -k2 ucscToINSDC.txt) insdc.genbank.txt | tr '[ ]' '[\t]' \
      | sort -k2 | join -2 2 name.coordinate.tab - |  tr '[ ]' '[\t]' \
        | cut -f1-4 > ucscToRefSeq.bed
    cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab hg38Patch11 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    checkTableCoords  hg38Patch11 -table=ucscToRefSeq
    # should cover %100 all bases:
    featureBits -countGaps hg38Patch11 ucscToRefSeq
    # 23260605 bases of 23260605 (100.000%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2016-12-05 - Hiram)

    mkdir /hive/data/genomes/hg38Patch11/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/hg38Patch11/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/hg38Patch11/hg38Patch11.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku hg38Patch11) > do.log 2>&1
    # real    1m9.904s

    cat fb.hg38Patch11.cpgIslandExtUnmasked.txt
    # 390840 bases of 27757875 (1.408%) in intersection

##########################################################################
# cpgIslands - (DONE - 2017-08-09 - Hiram)
    mkdir /hive/data/genomes/hg38Patch11/bed/cpgIslands
    cd /hive/data/genomes/hg38Patch11/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku hg38Patch11) > do.log 2>&1
    # real    0m43.302s

    cat fb.hg38Patch11.cpgIslandExt.txt
    # 267468 bases of 27757875 (1.360%) in intersection

##############################################################################
# cytoBandIdeo - (DONE - 2016-12-05 - Hiram)
    mkdir /hive/data/genomes/hg38Patch11/bed/cytoBand
    cd /hive/data/genomes/hg38Patch11/bed/cytoBand
    makeCytoBandIdeo.csh hg38Patch11

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2016-04-19 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg38Patch11/bed/microsat
    cd /hive/data/genomes/hg38Patch11/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed hg38Patch11 microsat microsat.bed
    # Read 312 elements of size 4 from microsat.bed

#############################################################################
# lastz alignments to hg38 (DONE - 2017-08-09 - Hiram)
#############################################################################

mkdir /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09

# prepare bits of hg38 sequence to lastz align to the patches,
# this is selecting out the specific section of hg38 where the patch
# is supposed to match, and setting up lastz parameters

rm -fr hg38Bits run.blastz run.blastz/tParts run.blastz/qParts psl \
   hg38Bits.lift
mkdir -p hg38Bits run.blastz run.blastz/tParts run.blastz/qParts psl
cut -f4 ../patchLocations.bed | while read FIX
do
  chr=`grep "${FIX}" ../patchLocations.bed | cut -f1`
  start=`grep "${FIX}" ../patchLocations.bed | cut -f2`
  end=`grep "${FIX}" ../patchLocations.bed | cut -f3`
  bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'`
  chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2`
  fixSize=`grep "${FIX}" ../hg38Patch11.chrom.sizes | cut -f2`
  printf "${chr}:${start}-${end} vs. ${FIX}:0-${fixSize}\n" 1>&2
  twoBitToFa /hive/data/genomes/hg38/hg38.unmasked.2bit:${chr}:${start}-${end} stdout \
    | sed -e "s/${chr}:/${FIX}_/g" > hg38Bits/${FIX}.fa
  fixName=`head -1 hg38Bits/${FIX}.fa | sed -e 's/>//;'`
  printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" 1>&2
  printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" >> hg38Bits.lift
  printf "/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.2bit:${fixName}:0-${bitSize}\n" 1>&2
  printf "/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.2bit:${fixName}:0-${bitSize}\n" > run.blastz/qParts/${fixName}.lst
  printf "/hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit:${FIX}:0-${fixSize}\n" > run.blastz/tParts/${fixName}.lst
  printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" 1>&2
  printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" >> run.blastz/jobList
done

faToTwoBit hg38Bits/*.fa hg38Bits.2bit
twoBitInfo hg38Bits.2bit stdout | sort -k2nr > hg38Bits.chrom.sizes

    printf 'BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
# human vs human
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg38Patch11
SEQ1_DIR=/hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit
SEQ1_LEN=/hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=1

# QUERY: Human Hg38
#SEQ2_DIR=/scratch/data/hg38/hg38.2bit
SEQ2_DIR=/hive/data/genomes/hg38/hg38.unmasked.2bit
SEQ2_LEN=/scratch/data/hg38/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.2bit
SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.chrom.sizes
SEQ2_LIFT=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.lift
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
SEQ2_LIMIT=1

BASE=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09
TMPDIR=/dev/shm
' > DEF

ssh ku
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/run.blastz
para create jobList
para try ... check ... push ... etc
para time > run.time
# Completed: 123 of 123 jobs
# CPU time in finished jobs:        164s       2.73m     0.05h    0.00d  0.000 y
# IO & Wait Time:                   330s       5.51m     0.09h    0.00d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              28s       0.47m     0.01h    0.00d
# Submission to last job:            46s       0.77m     0.01h    0.00d

# put together the individual results
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09
mkdir pslParts
cat psl/chr*.psl | gzip -c > pslParts/hg38Patch11.hg38.psl.gz

#   constructing a chain from those results
mkdir -p /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/axtChain/run
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/axtChain/run
time zcat ../../pslParts/hg38Patch11.hg38.psl.gz \
| axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \
    /hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit \
    /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.2bit \
    stdout \
| chainAntiRepeat /hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit \
    /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/hg38Bits.2bit  \
    stdin hg38Patch11.hg38.preLift.chain
    #   real    0m10.588s

liftUp -chainQ hg38Patch11.hg38.lifted.chain \
    ../../hg38Bits.lift carry hg38Patch11.hg38.preLift.chain

# constructing the net files:
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38.2017-08-09/axtChain

chainMergeSort run/hg38Patch11.hg38.lifted.chain \
        | gzip -c > hg38Patch11.hg38.all.chain.gz

chainSplit chain hg38Patch11.hg38.all.chain.gz
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
time chainPreNet hg38Patch11.hg38.all.chain.gz \
    ../../hg38Patch11.chrom.sizes \
    /hive/data/genomes/hg38/chrom.sizes stdout \
    | chainNet  stdin -minSpace=1 ../../hg38Patch11.chrom.sizes \
        /hive/data/genomes/hg38/chrom.sizes stdout /dev/null \
    | netSyntenic stdin noClass.net
    #   real    0m0.605s

hgLoadChain -tIndex hg38Patch11 chainHg38 hg38Patch11.hg38.all.chain.gz
# Loading 25196 chains into hg38Patch11.chainHg38

featureBits hg38Patch11 chainHg38Link
# 42087519 bases of 43164255 (97.505%) in intersection

netClass -verbose=0 -noAr noClass.net hg38Patch11 hg38 hg38Patch11.hg38.net

netFilter -minGap=10 hg38Patch11.hg38.net \
  | hgLoadNet -verbose=0 hg38Patch11 netHg38 stdin

# Make liftOver chains:
netChainSubset -verbose=0 noClass.net hg38Patch11.hg38.all.chain.gz stdout \
    | chainStitchId stdin stdout | gzip -c > hg38Patch11.hg38.over.chain.gz

# Make axtNet for download: one .axt per hg38Patch11 seq.
netSplit noClass.net net
cd ..
mkdir -p axtNet
# beware, tcsh scripting here:
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  ../hg38Patch11.unmasked.2bit \
        /hive/data/genomes/hg38/hg38.unmasked.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.hg38Patch11.hg38.net.axt.gz
end

# Make mafNet for multiz: one .maf per hg38Patch11 seq.
mkdir -p mafNet
# beware, tcsh scripting here:
foreach f (axtNet/*.hg38Patch11.hg38.net.axt.gz)
  axtToMaf -tPrefix=hg38Patch11. -qPrefix=hg38. $f \
        ../hg38Patch11.chrom.sizes \
        /hive/data/genomes/hg38/chrom.sizes \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

#############################################################################
# run this same business with hg38 as target, Patch11 sequence as query
#  (DONE - 2017-08-09 - Hiram)

mkdir /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09

rm -fr hg38Bits run.blastz psl hg38Bits.lift
mkdir -p hg38Bits run.blastz/tParts run.blastz/qParts psl
cut -f4 ../patchLocations.bed | while read FIX
do
  chr=`grep "${FIX}" ../patchLocations.bed | cut -f1`
  start=`grep "${FIX}" ../patchLocations.bed | cut -f2`
  end=`grep "${FIX}" ../patchLocations.bed | cut -f3`
  bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'`
  chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2`
  fixSize=`grep "${FIX}" ../hg38Patch11.chrom.sizes | cut -f2`
  echo ${chr}:${start}-${end} vs. ${FIX}:0-${fixSize} 1>&2
  twoBitToFa /hive/data/genomes/hg38/hg38.unmasked.2bit:${chr}:${start}-${end} stdout \
    | sed -e "s/${chr}:/${FIX}_/g"  > hg38Bits/${FIX}.fa
  fixName=`head -1 hg38Bits/${FIX}.fa | sed -e 's/>//;'`
  echo -e "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}" >> hg38Bits.lift
  echo -e "/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/hg38Bits.2bit:${fixName}:0-${bitSize}" > run.blastz/tParts/${FIX}.lst
  echo -e "/hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit:${FIX}:0-${fixSize}" > run.blastz/qParts/${FIX}.lst
  echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${FIX}.lst qParts/${FIX}.lst ../DEF {check out exists ../psl/${FIX}.psl}" >> run.blastz/jobList
done

faToTwoBit hg38Bits/*.fa hg38Bits.2bit
twoBitInfo hg38Bits.2bit stdout | sort -k2n > hg38Bits.chrom.sizes

    printf 'BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
# human vs human
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.unmasked.2bit
SEQ1_LEN=/scratch/data/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/hg38Bits.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/hg38Bits.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/hg38Bits.lift
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=1

# QUERY: Human Hg38Patch11
SEQ2_DIR=/hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit
SEQ2_LEN=/hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
SEQ2_LIMIT=1

BASE=/hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09
TMPDIR=/dev/shm
' > DEF

ssh ku
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/run.blastz
para create jobList
para try ... check ... push ... etc
para time > run.time
sed -e 's/^/# /;' run.time
# Completed: 123 of 123 jobs
# CPU time in finished jobs:        160s       2.66m     0.04h    0.00d  0.000 y
# IO & Wait Time:                   330s       5.51m     0.09h    0.00d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              27s       0.45m     0.01h    0.00d
# Submission to last job:            44s       0.73m     0.01h    0.00d

# put together the individual results
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09
mkdir pslParts
cat psl/chr*.psl | gzip -c > pslParts/hg38.hg38Patch11.psl.gz

#   constructing a chain from those results
mkdir -p /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/axtChain/run
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/axtChain/run

time zcat ../../pslParts/hg38.hg38Patch11.psl.gz \
| axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \
    /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/hg38Bits.2bit \
    /hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit \
    stdout \
     | chainAntiRepeat /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/hg38Bits.2bit \
     /hive/data/genomes/hg38/bed/hg38Patch11/hg38Patch11.unmasked.2bit \
    stdin hg38.hg38Patch11.preLift.chain
    #   real    0m7.347s

liftUp hg38.hg38Patch11.lifted.chain \
    ../../hg38Bits.lift carry hg38.hg38Patch11.preLift.chain

# constructing the net files:
cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/axtChain

chainMergeSort run/hg38.hg38Patch11.lifted.chain \
        | gzip -c > hg38.hg38Patch11.all.chain.gz

hgLoadChain -tIndex hg38 chainHg38Patch11 hg38.hg38Patch11.all.chain.gz
# Loading 25108 chains into hg38.chainHg38Patch11

featureBits hg38 chainHg38Patch11Link
# 38662499 bases of 3049335806 (1.268%) in intersection

chainSplit chain hg38.hg38Patch11.all.chain.gz
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
time chainPreNet hg38.hg38Patch11.all.chain.gz \
    /hive/data/genomes/hg38/chrom.sizes \
    ../../hg38Patch11.chrom.sizes stdout \
    | chainNet  stdin -minSpace=1 /hive/data/genomes/hg38/chrom.sizes \
        ../../hg38Patch11.chrom.sizes stdout /dev/null \
    | netSyntenic stdin noClass.net
    #   real    0m0.564s

time netClass -verbose=0 -noAr noClass.net hg38 hg38Patch11 hg38.hg38Patch11.net
#  real    0m19.909s

time netFilter -minGap=10 hg38.hg38Patch11.net \
  | hgLoadNet -verbose=0 hg38 netHg38Patch11 stdin
#  real    0m0.112s

# Make liftOver chains:
netChainSubset -verbose=0 noClass.net hg38.hg38Patch11.all.chain.gz stdout \
    | chainStitchId stdin stdout | gzip -c > hg38.hg38Patch11.over.chain.gz

# Make axtNet for download: one .axt per hg38Patch11 seq.
netSplit noClass.net net
cd ..
mkdir -p axtNet
# more tcsh here
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  /hive/data/genomes/hg38/hg38.unmasked.2bit \
        ../hg38Patch11.unmasked.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.hg38.hg38Patch11.net.axt.gz
end

# Make mafNet for multiz: one .maf per hg38Patch11 seq.
mkdir -p mafNet
# tcsh loop again
foreach f (axtNet/*.hg38.hg38Patch11.net.axt.gz)
  axtToMaf -tPrefix=hg38. -qPrefix=hg38Patch11. $f \
        /hive/data/genomes/hg38/chrom.sizes \
        ../hg38Patch11.chrom.sizes \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

cd /hive/data/genomes/hg38/bed/hg38Patch11/lastzHg38Patch11.2017-08-09/axtChain
mkdir -p queryChains
chainSplit -q queryChains hg38.hg38Patch11.all.chain.gz

# then run a 'lift over' chain/net on each single one
mkdir -p singleLiftOver

for F in queryChains/*.chain
do
    C=`basename ${F}`
    B=`echo ${C} | sed -e "s/.chain//"`
    chainPreNet -inclHap ${F} /hive/data/genomes/hg38/chrom.sizes \
        ../../hg38Patch11.chrom.sizes stdout \
    | chainNet -inclHap stdin -minSpace=1 /hive/data/genomes/hg38/chrom.sizes \
        ../../hg38Patch11.chrom.sizes singleLiftOver/${B}.raw.net \
        /dev/null
    netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net
    netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net
    netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \
        ${F} stdout \
    | chainStitchId stdin stdout > singleLiftOver/${C}
    printf "${F} -> singleLiftOver/${C}\n" 1>&2
done

# put the chains together into one file
chainMergeSort singleLiftOver/chr*.chain | gzip -c \
        > hg38.hg38Patch11.single.over.chain.gz
# -rw-rw-r-- 1  43719 Aug  9 09:17 hg38.hg38Patch11.single.over.chain.gz

 # construct psl files from those chains
chainToPsl hg38.hg38Patch11.single.over.chain.gz \
   /hive/data/genomes/hg38/chrom.sizes \
    ../../hg38Patch11.chrom.sizes \
     /hive/data/genomes/hg38/hg38.unmasked.2bit \
       ../../hg38Patch11.unmasked.2bit \
        hg38.hg38Patch11.over.psl
# pslCheck reports no errors
pslCheck -db=hg38 hg38.hg38Patch11.over.psl
# checked: 565 failed: 0 errors: 0

# pslRecalcMatch doesn't change anything
# pslRecalcMatch hg38.hg38Patch11.over.psl \
#    /hive/data/genomes/hg38/hg38.unmasked.2bit \
#      ../../hg38Patch11.unmasked.2bit hg38.hg38Patch11.recalc.over.psl

# load this PSL track
#     this table name prefix altSeqLiftOverPsl is recognized in hgc clicks
hgLoadPsl hg38 -table=altSeqLiftOverPslP11 hg38.hg38Patch11.over.psl

mkdir /hive/data/genomes/hg38/bed/hg38Patch11/seqExt
cd /hive/data/genomes/hg38/bed/hg38Patch11/seqExt
twoBitToFa ../hg38Patch11.unmasked.2bit hg38Patch11.fa
mkdir -p /gbdb/hg38/hg38Patch11 hg38Patch11
faSplit byname hg38Patch11.fa ./hg38Patch11/
ln -s `pwd`/hg38Patch11/*.fa /gbdb/hg38/hg38Patch11
hgLoadSeq -drop -seqTbl=seqHg38Patch11 -extFileTbl=extHg38Patch11 hg38 \
        /gbdb/hg38/hg38Patch11/*.fa

#############################################################################
# wrap up scripts and file editing (trackDb, hgdownload)
#  (DONE - 2017-08-09 - Hiram)

# edit ${HOME}/kent/src/hg/makeDb/doc/hg38/patchDescr.pl to reflect this most
# recent patch version.

cd /hive/data/genomes/hg38/bed/hg38Patch11
# edit patchDescr.pl to update to this patch level
${HOME}/kent/src/hg/makeDb/doc/hg38/patchDescr.pl \
    > ${HOME}/kent/src/hg/makeDb/trackDb/human/hg38/hg38Patch11.html

# copy README.txt from previous patch and edit as needed - replace the
# patch description lines with the output of running
cd /hive/data/genomes/hg38/bed/hg38Patch11
cp -p ../hg38Patch9/readmeLines.pl .
./readmeLines.pl genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt \
    genbank/GCA_000001405.26_GRCh38.p11_assembly_regions.txt \
      patchLocations.bed new.sequences.list > add.to.README.txt

mkdir /hive/data/genomes/hg38/bed/hg38Patch11/downloads
cd /hive/data/genomes/hg38/bed/hg38Patch11/downloads
#This README.txt file needs to be edited, and add the lines ../add.to.README.txt
cp -p /hive/data/genomes/hg38/bed/hg38Patch9/downloads/README.txt .
ln -s ../hg38Patch11.unmasked.2bit hg38Patch11.2bit
ln -s ../hg38Patch11.agp
ln -s ../hg38Patch11.fa.gz
md5sum * > md5sum.txt
mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/hg38Patch11
ln -s `pwd`/*  /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/hg38Patch11/

#############################################################################
