# for emacs: -*- mode: sh; -*-

# Assembly Name:  GRCm38.p4
# Description:    Genome Reference Consortium Mouse Build 38 patch release 4
# (GRCm38.p4)
# Organism name:  Mus musculus
# Taxid:          10090
# Submitter:      Genome Reference Consortium
# Date:           2015-3-23
# Assembly type:  haploid-with-alt-loci
# Release type:   patch
# Assembly level: Chromosome
# Genome representation: full
# GenBank Assembly Accession: GCA_000001635.6 (latest)
# RefSeq Assembly Accession: GCF_000001635.24 (latest)
# RefSeq Assembly and GenBank Assemblies Identical: yes

##############################################################################
# mm10 patch 4 build
##############################################################################
##  download sequence, prepare files (DONE - 2016-03-08 - Hiram)
##############################################################################

mkdir /hive/data/genomes/mm10/bed/mm10Patch4
cd /hive/data/genomes/mm10/bed/mm10Patch4
mkdir genbank
cd genbank
time rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCA_000001635.6_GRCm38.p4/ ./
# real    4m30.702s

# appears to be the entire assembly:
faSize GCA_000001635.6_GRCm38.p4_genomic.fna.gz
# 2803568840 bases (79356978 N's 2724211862 real 1731886981 upper
#    992324881 lower) in 196 sequences in 1 files
# Total size: mean 14303922.7 sd 41517244.4 min 1976 (JH584295.1)
#    max 195471971 (CM000994.2) median 207968
# %35.40 masked total, %36.43 masked real

# so the question is, what is new here compared to what we have in mm10

cd /hive/data/genomes/mm10/bed/mm10Patch4
time faCount genbank/GCA_000001635.6_GRCm38.p4_genomic.fna.gz \
   > faCount.GRCm38.p4.txt
#  real    1m4.939s

~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \
   | grep new | sed -e 's/^/# /'

# one of the new sequences is missing some information from the
#  assembly_report:
MMCHRUN_NODMRKTAC_CTG1  alt-scaffold    na      na      GL456050.1      =      NT_080256.1      NOD/MrkTac      142341  na
# it appers to be a sequence that isn't located on this assembly

# resulting in an incorrect name:

# chrna_GL456050_alt    142341  GL456050.1      NT_080256.1     new

# there are 130 new sequences:

# chr1_KK082441_alt     456798  KK082441.1      NW_006763128.1  new
# chr3_KQ030484_fix     214957  KQ030484.1      NW_012132900.1  new
# chr4_JH792826_fix     368286  JH792826.1      NW_004058050.1  new
# chr4_KQ030485_fix     185548  KQ030485.1      NW_012132901.1  new
# chr4_KQ030486_fix     399265  KQ030486.1      NW_012132902.1  new
# chr4_KQ030487_fix     316842  KQ030487.1      NW_012132903.1  new
# chr4_KQ030488_fix     45901   KQ030488.1      NW_012132904.1  new
# chr4_KQ030489_fix     188269  KQ030489.1      NW_012132905.1  new
# chr5_JH792827_fix     205713  JH792827.1      NW_004058051.1  new
# chr6_KK082442_fix     154766  KK082442.1      NW_006763129.1  new
# chr6_KK082443_fix     620323  KK082443.1      NW_006763130.1  new
# chr7_JH792828_fix     473738  JH792828.1      NW_004058052.1  new
# chr9_KB469738v3_fix   210641  KB469738.3      NW_004450259.3  new
# chr9_KQ030490_fix     120154  KQ030490.1      NW_012132906.1  new
# chr10_KQ030491_fix    129865  KQ030491.1      NW_012132907.1  new
# chr11_KB469739_fix    331111  KB469739.1      NW_004450260.1  new
# chr12_KB469740_fix    316140  KB469740.1      NW_004450261.1  new
# chr15_KQ030492_fix    166012  KQ030492.1      NW_012132908.1  new
# chr15_KQ030493_fix    269476  KQ030493.1      NW_012132909.1  new
# chr16_KB469741_fix    267241  KB469741.1      NW_004450262.1  new
# chr17_KB469742_fix    1059955 KB469742.1      NW_004450263.1  new
# chr18_JH792829_fix    352455  JH792829.1      NW_004058053.1  new
# chr19_JH792830_fix    246751  JH792830.1      NW_004058054.1  new
# chr19_KQ030494_fix    506812  KQ030494.1      NW_012132910.1  new
# chrX_JH792831_fix     182256  JH792831.1      NW_004058055.1  new
# chrX_KQ030495_fix     490000  KQ030495.1      NW_012132911.1  new
# chrX_KQ030496_fix     260447  KQ030496.1      NW_012132912.1  new
# chrX_KQ030497_fix     219721  KQ030497.1      NW_012132913.1  new
# chrY_JH792832_fix     544189  JH792832.1      NW_004058056.1  new
# chrY_JH792833_fix     221588  JH792833.1      NW_004058057.1  new
# chrY_JH792834_fix     331480  JH792834.1      NW_004058058.1  new
# chr19_GL456079_alt    46440   GL456079.1      NT_039697.1     new
# chr2_GL456024v2_alt   410905  GL456024.2      NT_039223.3     new
# chr3_GL456006_alt     320308  GL456006.1      NT_039248.1     new
# chr3_GL456007_alt     273369  GL456007.1      NT_039256.1     new
# chr3_GL456008_alt     124933  GL456008.1      NT_080257.1     new
# chr5_GL456011_alt     180394  GL456011.1      NT_109599.1     new
# chr6_GL456025_alt     145152  GL456025.1      NT_039373.1     new
# chr6_GL456026v2_alt   636141  GL456026.2      NT_039382.4     new
# chr7_GL456014_alt     91251   GL456014.1      NT_078572.1     new
# chr12_GL456017v2_alt  1714434 GL456017.2      NT_114985.3     new
# chr12_GL456074_alt    84250   GL456074.1      NT_114988.1     new
# chr13_JH584305_alt    179252  JH584305.1      NT_187006.1     new
# chr14_GL456019_alt    1666365 GL456019.1      NT_039614.1     new
# chr16_JH584306_alt    480699  JH584306.1      NT_187007.1     new
# chr16_JH584307_alt    250595  JH584307.1      NT_187008.1     new
# chr17_JH584308_alt    273566  JH584308.1      NT_187010.1     new
# chr17_JH584309_alt    162935  JH584309.1      NT_187011.1     new
# chr17_JH590470_alt    1154078 JH590470.1      NT_187009.1     new
# chrX_GL456031_alt     247850  GL456031.1      NT_111920.1     new
# chrX_GL456032_alt     194271  GL456032.1      NT_039744.2     new
# chrX_GL456033v2_alt   78689   GL456033.2      NT_039747.4     new
# chr7_GL456013_alt     34498   GL456013.1      NT_039443.1     new
# chr1_GL455991_alt     281255  GL455991.1      NT_039192.2     new
# chr1_GL455992v2_alt   120856  GL455992.2      NT_039194.5     new
# chr1_GL455993_alt     272653  GL455993.1      NT_039198.1     new
# chr4_GL455994_alt     170592  GL455994.1      NT_039291.1     new
# chr5_GL455995_alt     262166  GL455995.1      NT_039334.1     new
# chr8_GL455996_alt     183135  GL455996.1      NT_039469.1     new
# chr8_GL455997_alt     251056  GL455997.1      NT_039470.1     new
# chr11_GL455998_alt    170397  GL455998.1      NT_078683.1     new
# chr13_GL455999v2_alt  138435  GL455999.2      NT_039594.3     new
# chr15_GL456000_alt    88442   GL456000.1      NT_039622.1     new
# chr16_GL456001v2_alt  1618162 GL456001.2      NT_039634.4     new
# chr16_JH584310_alt    261336  JH584310.1      NT_187012.1     new
# chr16_JH584311_alt    189707  JH584311.1      NT_187013.1     new
# chr16_JH584312_alt    184044  JH584312.1      NT_187014.1     new
# chr16_JH584313_alt    374389  JH584313.1      NT_187015.1     new
# chr16_JH584314_alt    170763  JH584314.1      NT_187016.1     new
# chr17_GL456002v2_alt  285971  GL456002.2      NT_039666.2     new
# chr17_GL456021v2_alt  28130   GL456021.2      NT_039661.4     new
# chrX_GL456003v2_alt   414761  GL456003.2      NT_039739.2     new
# chrX_GL456004_alt     123835  GL456004.1      NT_099024.1     new
# chr1_GL456005_alt     154284  GL456005.1      NT_039195.1     new
# chr1_JH584315_alt     220020  JH584315.1      NT_187017.1     new
# chr4_GL456010_alt     76099   GL456010.1      NT_039296.1     new
# chr6_GL456012v2_alt   312568  GL456012.2      NT_039372.2     new
# chr6_GL456065_alt     198515  GL456065.1      NT_039367.1     new
# chr11_GL456016_alt    161334  GL456016.1      NT_039525.1     new
# chr14_GL456020_alt    105813  GL456020.1      NT_096797.1     new
# chr16_GL456028v2_alt  176974  GL456028.2      NT_039630.5     new
# chr17_GL456022v2_alt  1934410 GL456022.2      NT_039662.3     new
# chr13_GL455990_alt    100803  GL455990.1      NT_039593.1     new
# chrY_GL456080_alt     191521  GL456080.1      NT_166394.1     new
# chrY_GL456081_alt     185346  GL456081.1      NT_166397.1     new
# chrY_GL456082_alt     209635  GL456082.1      NT_166400.1     new
# chr7_GL455989_alt     55856   GL455989.1      NT_095534.1     new
# chr12_GL456068_alt    59641   GL456068.1      NT_096355.1     new
# chr11_JH584316_alt    247497  JH584316.1      NT_187028.1     new
# chr11_JH584317_alt    186379  JH584317.1      NT_187029.1     new
# chr18_JH584318_alt    231969  JH584318.1      NT_187030.1     new
# chrY_GL456071_alt     189119  GL456071.1      NT_166396.1     new
# chrY_GL456072_alt     204663  GL456072.1      NT_166398.1     new
# chrY_GL456073_alt     196014  GL456073.1      NT_166399.1     new
# chr1_JH584320_alt     1003859 JH584320.1      NT_187019.1     new
# chr1_JH584321_alt     3760957 JH584321.1      NT_187020.1     new
# chr1_JH584322_alt     326027  JH584322.1      NT_187021.1     new
# chr3_GL456042v2_alt   689363  GL456042.2      NT_114257.4     new
# chr3_GL456044v2_alt   466430  GL456044.2      NT_166287.2     new
# chr3_GL456045v2_alt   161821  GL456045.2      NT_039247.2     new
# chr3_GL456048_alt     292416  GL456048.1      NT_039252.1     new
# chr3_GL456049v2_alt   243366  GL456049.2      NT_039257.4     new
# chr3_JH584323_alt     1153325 JH584323.1      NT_187022.1     new
# chr4_GL456053v2_alt   3205425 GL456053.2      NT_166299.2     new
# chr4_JH584324_alt     3050841 JH584324.1      NT_187023.1     new
# chr4_JH584325_alt     215601  JH584325.1      NT_187024.1     new
# chr4_JH584326_alt     1625541 JH584326.1      NT_187025.1     new
# chr11_GL456060v2_alt  1462311 GL456060.2      NT_166317.2     new
# chr11_JH584327_alt    1550185 JH584327.1      NT_187026.1     new
# chr17_JH584328_alt    4256209 JH584328.1      NT_187027.1     new
# chrna_GL456050_alt    142341  GL456050.1      NT_080256.1     new
# chr6_GL456054v2_alt   5956088 GL456054.2      NT_166305.2     new
# chr6_JH584264_alt     1759414 JH584264.1      NT_187002.1     new
# chr11_JH584265_alt    3086744 JH584265.1      NT_187003.1     new
# chr17_JH584266_alt    4910976 JH584266.1      NT_187004.1     new
# chr17_JH584267_alt    1908920 JH584267.1      NT_187005.1     new
# chr12_GL456349_alt    83612   GL456349.1      NT_166433.1     new
# chr18_GL456070_alt    85548   GL456070.1      NT_165784.1     new
# chr4_GL456009_alt     240498  GL456009.1      NT_109318.2     new
# chr4_GL456064_alt     138473  GL456064.1      NT_039292.1     new
# chr10_GL456015_alt    77891   GL456015.1      NT_078651.1     new
# chr17_GL456069_alt    138490  GL456069.1      NT_097336.1     new
# chr19_JH584319_alt    25407   JH584319.1      NT_187018.1     new
# chr4_GL456075_alt     264822  GL456075.1      NT_166293.1     new
# chr4_GL456076_alt     191695  GL456076.1      NT_166294.1     new
# chr4_GL456077_alt     83218   GL456077.1      NT_166295.1     new
# chr4_JH584268_alt     69753   JH584268.1      NT_186999.1     new
# chr4_JH584269_alt     284739  JH584269.1      NT_187000.1     new
# chr12_GL456078_alt    99963   GL456078.1      NT_166322.1     new
# chr15_JH584270_alt    89177   JH584270.1      NT_187001.1     new

# how much sequence:
~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \
        | grep new | awk '{sum += $2; printf "%d\t%s\n", sum, $0}' | tail
#  72697066   chr15_JH584270_alt   89177   JH584270.1      NT_187001.1  new

~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \
     | grep new > new.sequences.list

cut -f3 new.sequences.list > extract.new.list
awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed

~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \
  faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \
     | grep -v new > existing.sequences.list

cut -f3 existing.sequences.list > extract.exist.list

faSomeRecords genbank/GCA_000001635.6_GRCm38.p4_genomic.fna.gz \
    extract.new.list stdout | sed -e 's/ .*//;' | \
      sed -f genbankToUCSC.sed | gzip -c > mm10Patch4.fa.gz

faSomeRecords genbank/GCA_000001635.6_GRCm38.p4_genomic.fna.gz \
  extract.exist.list stdout | sed -e 's/ .*//;' | gzip -c > existing.fa.gz

# verify same amount of sequence here as mm10:
faSize existing.fa.gz
# 2730871774 bases (78088274 N's 2652783500 real 1686409493 upper
#    966374007 lower) in 66 sequences in 1 files
# Total size: mean 41376845.1 sd 63617337.3 min 1976 (JH584295.1)
#    max 195471971 (CM000994.2) median 184189
# %35.39 masked total, %36.43 masked real

# mm10 has different masking, but totals are the same
head -1 ../../faSize.mm10.2bit.txt 
# 2730871774 bases (78088274 N's 2652783500 real 1454267808 upper 1198515692 lower) in 66 sequences in 1 files

# verify correct amount of patch4 sequence here:
faSize mm10Patch4.fa.gz
# 72697066 bases (1268704 N's 71428362 real 45477488 upper
#    25950874 lower) in 130 sequences in 1 files
# Total size: mean 559208.2 sd 959105.8 min 25407 (chr19_JH584319_alt)
#    max 5956088 (chr6_GL456054v2_alt) median 221588
# %35.70 masked total, %36.33 masked real

# this is the same total obtained before:
#  72697066   chr15_JH584270_alt   89177   JH584270.1      NT_187001.1  new

# and both together should equal the original full patch4 sequence
zcat existing.fa.gz mm10Patch4.fa.gz | faSize stdin
# 2803568840 bases (79356978 N's 2724211862 real 1731886981 upper 992324881 lower) in 196 sequences in 1 files
# Total size: mean 14303922.7 sd 41517244.4 min 1976 (JH584295.1) max 195471971 (CM000994.2) median 207968
# %35.40 masked total, %36.43 masked real

# construct locations file:
~/kent/src/hg/makeDb/doc/mm10Patch4/regionScan.pl extract.new.list \
   ucsc.genbank.tab genbank/GCA_000001635.6_GRCm38.p4_assembly_regions.txt \
     | egrep -v "MMCHR6_CHORI29_IDD6_3" > patchLocations.bed

# what are these:
# not finding PAR na on the newList
# not finding PAR na on the newList
# not done GL456050.1
# not done GL456071.1
# not done GL456072.1
# not done GL456073.1
# not done GL456080.1
# not done GL456081.1
# not done GL456082.1

# the PAR is merely these two items:
# PAR     X       169969759       170931299       PAR     na      na      na
# PAR     Y       90745845        91644698        PAR     na      na      na

# Message from Valerie Schneider
# These are alternate loci for which there is no placement defined on the
#    GRCm38 B57BL/6J primary assembly. 

# The rules for alternate loci in mouse, where the primary assembly unit
# represents a single strain, are a little different than they are for human.
# In mouse, each alt unit represents a different strain. In contrast to
# human, mouse alt loci do not contain an "anchor" sequence that is part
# of the primary assembly unit, b/c we never mix strain
# sequences (on either primary or alt).

# The lack of placements is allowed for mouse, but not for human.
# All human alt loci must have a placement, and the anchor ensures
# that placement can be made.

# separate haplotypes from fix patches for two tracks:
# the sed command turn the UCSC chrom names for these new sequences back
# into their genbank names for external URL reference
grep -v fix patchLocations.bed | sed -e 's/_alt//;' \
   | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v/./;' > mm10Patch4Haplotypes.bed

hgLoadBed -type=bed4 mm10 mm10Patch4Haplotypes mm10Patch4Haplotypes.bed 
# Read 93 elements of size 4 from mm10Patch4Haplotypes.bed

grep fix patchLocations.bed | sed -e 's/_fix//;' \
   | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v\([0-9]\)$/.\1/;' \
     > mm10Patch4Patches.bed

# verify none lost:
    wc -l mm10Patch4Haplotypes.bed mm10Patch4Patches.bed
#   93 mm10Patch4Haplotypes.bed
#   29 mm10Patch4Patches.bed
#  122 total

    wc -l patchLocations.bed
#  122 patchLocations.bed

hgLoadBed -type=bed4 mm10 mm10Patch4Patches mm10Patch4Patches.bed 
# Read 41 elements of size 4 from mm10Patch4Patches.bed

# construct 2bit file:
faToTwoBit mm10Patch4.fa.gz mm10Patch4.unmasked.2bit
twoBitInfo mm10Patch4.unmasked.2bit stdout | sort -k2nr > mm10Patch4.chrom.sizes
# take a look at that to verify it looks OK:
cat mm10Patch4.chrom.sizes | sed -e 's/^/# /;'
# chr6_GL456054v2_alt   5956088
# chr17_JH584266_alt    4910976
# chr17_JH584328_alt    4256209
# chr1_JH584321_alt     3760957
# chr4_GL456053v2_alt   3205425
# chr11_JH584265_alt    3086744
# chr4_JH584324_alt     3050841
# chr17_GL456022v2_alt  1934410
# chr17_JH584267_alt    1908920
# chr6_JH584264_alt     1759414
# chr12_GL456017v2_alt  1714434
# chr14_GL456019_alt    1666365
# chr4_JH584326_alt     1625541
# chr16_GL456001v2_alt  1618162
# chr11_JH584327_alt    1550185
# chr11_GL456060v2_alt  1462311
# chr17_JH590470_alt    1154078
# chr3_JH584323_alt     1153325
# chr17_KB469742_fix    1059955
# chr1_JH584320_alt     1003859
# chr3_GL456042v2_alt   689363
# chr6_GL456026v2_alt   636141
# chr6_KK082443_fix     620323
# chrY_JH792832_fix     544189
# chr19_KQ030494_fix    506812
# chrX_KQ030495_fix     490000
# chr16_JH584306_alt    480699
# chr7_JH792828_fix     473738
# chr3_GL456044v2_alt   466430
# chr1_KK082441_alt     456798
# chrX_GL456003v2_alt   414761
# chr2_GL456024v2_alt   410905
# chr4_KQ030486_fix     399265
# chr16_JH584313_alt    374389
# chr4_JH792826_fix     368286
# chr18_JH792829_fix    352455
# chrY_JH792834_fix     331480
# chr11_KB469739_fix    331111
# chr1_JH584322_alt     326027
# chr3_GL456006_alt     320308
# chr4_KQ030487_fix     316842
# chr12_KB469740_fix    316140
# chr6_GL456012v2_alt   312568
# chr3_GL456048_alt     292416
# chr17_GL456002v2_alt  285971
# chr4_JH584269_alt     284739
# chr1_GL455991_alt     281255
# chr17_JH584308_alt    273566
# chr3_GL456007_alt     273369
# chr1_GL455993_alt     272653
# chr15_KQ030493_fix    269476
# chr16_KB469741_fix    267241
# chr4_GL456075_alt     264822
# chr5_GL455995_alt     262166
# chr16_JH584310_alt    261336
# chrX_KQ030496_fix     260447
# chr8_GL455997_alt     251056
# chr16_JH584307_alt    250595
# chrX_GL456031_alt     247850
# chr11_JH584316_alt    247497
# chr19_JH792830_fix    246751
# chr3_GL456049v2_alt   243366
# chr4_GL456009_alt     240498
# chr18_JH584318_alt    231969
# chrY_JH792833_fix     221588
# chr1_JH584315_alt     220020
# chrX_KQ030497_fix     219721
# chr4_JH584325_alt     215601
# chr3_KQ030484_fix     214957
# chr9_KB469738v3_fix   210641
# chrY_GL456082_alt     209635
# chr5_JH792827_fix     205713
# chrY_GL456072_alt     204663
# chr6_GL456065_alt     198515
# chrY_GL456073_alt     196014
# chrX_GL456032_alt     194271
# chr4_GL456076_alt     191695
# chrY_GL456080_alt     191521
# chr16_JH584311_alt    189707
# chrY_GL456071_alt     189119
# chr4_KQ030489_fix     188269
# chr11_JH584317_alt    186379
# chr4_KQ030485_fix     185548
# chrY_GL456081_alt     185346
# chr16_JH584312_alt    184044
# chr8_GL455996_alt     183135
# chrX_JH792831_fix     182256
# chr5_GL456011_alt     180394
# chr13_JH584305_alt    179252
# chr16_GL456028v2_alt  176974
# chr16_JH584314_alt    170763
# chr4_GL455994_alt     170592
# chr11_GL455998_alt    170397
# chr15_KQ030492_fix    166012
# chr17_JH584309_alt    162935
# chr3_GL456045v2_alt   161821
# chr11_GL456016_alt    161334
# chr6_KK082442_fix     154766
# chr1_GL456005_alt     154284
# chr6_GL456025_alt     145152
# chrna_GL456050_alt    142341
# chr17_GL456069_alt    138490
# chr4_GL456064_alt     138473
# chr13_GL455999v2_alt  138435
# chr10_KQ030491_fix    129865
# chr3_GL456008_alt     124933
# chrX_GL456004_alt     123835
# chr1_GL455992v2_alt   120856
# chr9_KQ030490_fix     120154
# chr14_GL456020_alt    105813
# chr13_GL455990_alt    100803
# chr12_GL456078_alt    99963
# chr7_GL456014_alt     91251
# chr15_JH584270_alt    89177
# chr15_GL456000_alt    88442
# chr18_GL456070_alt    85548
# chr12_GL456074_alt    84250
# chr12_GL456349_alt    83612
# chr4_GL456077_alt     83218
# chrX_GL456033v2_alt   78689
# chr10_GL456015_alt    77891
# chr4_GL456010_alt     76099
# chr4_JH584268_alt     69753
# chr12_GL456068_alt    59641
# chr7_GL455989_alt     55856
# chr19_GL456079_alt    46440
# chr4_KQ030488_fix     45901
# chr7_GL456013_alt     34498
# chr17_GL456021v2_alt  28130
# chr19_JH584319_alt    25407

zcat genbank/GCA_000001635.6_GRCm38.p4_assembly_structure/*/alt_scaffolds/AGP/*.agp.gz \
   | sed -f genbankToUCSC.sed > mm10Patch4.agp

checkAgpAndFa mm10Patch4.agp mm10Patch4.unmasked.2bit | tail -1
# All AGP and FASTA entries agree - both files are valid

#############################################################################
# build mm10Patch4 database (DONE - 2016-05-16 - Hiram)
    # need this database for netClass operations during the chain/net
    # construction

    mkdir /hive/data/genomes/mm10Patch4
    cd /hive/data/genomes/mm10Patch4
    mkdir /gbdb/mm10Patch4
    ln -s /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit ./
    ln -s /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.agp ./

    twoBitInfo mm10Patch4.unmasked.2bit stdout | sort -k2nr > chrom.sizes
    mkdir -p bed/chromInfo
    awk '{printf "%s\t%d\t/gbdb/mm10Patch4/mm10Patch4.2bit\n", $1, $2}' \
          chrom.sizes > bed/chromInfo/chromInfo.tab

    hgsql -e 'create database mm10Patch4;' mm10

    hgsql mm10Patch4 < $HOME/kent/src/hg/lib/grp.sql

    hgLoadSqlTab mm10Patch4 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql \
             bed/chromInfo/chromInfo.tab
    hgGoldGapGl -noGl mm10Patch4 mm10Patch4.agp
    featureBits -or -countGaps mm10Patch4 gold gap
    # 72697066 bases of 72697066 (100.000%) in intersection

    hgsql hgcentraltest -e 'INSERT INTO dbDb (name, description, nibPath,
organism, defaultPos, active, orderKey, genome, scientificName, htmlPath,
hgNearOk, hgPbOk, sourceName, taxId)
VALUES ("mm10Patch4", "Mar. 2015", "/gbdb/mm10Patch4", "GRCm38.p4",
"chr4_KQ030486_fix:1-399265", 1, 7764, "GRCm38.p4", "Mus musculus",
"/gbdb/mm10Patch4/html/description.html", 0, 0,
"GRCm38 Genome Reference Consortium Mouse Build 38 patch release 4", 10090);
INSERT INTO defaultDb (genome, name) VALUES ("GRCm38.p4", "mm10Patch4");
INSERT INTO genomeClade (genome, clade, priority)
VALUES ("GRCm38.p4", "haplotypes", 198);'

    mkdir html
    # copy description.html from mm10Patch1/html/description.html
    # edit to update for Patch4

    mkdir -p /hive/data/genomes/mm10Patch4/bed/gc5Base
    cd /hive/data/genomes/mm10Patch4/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 mm10Patch4 \
                 ../../mm10Patch4.unmasked.2bit | wigEncode stdin gc5Base.{wig,wib}
    # Converted stdin, upper limit 100.00, lower limit 0.00
    mkdir /gbdb/mm10Patch4/wib
    ln -s `pwd`/bed/gc5Base/gc5Base.wib /gbdb/mm10Patch4/wib

    cd bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/mm10Patch4/wib mm10Patch4 gc5Base \
       gc5Base.wig

    mkdir /hive/data/genomes/mm10Patch4/bed/repeatMasker
    cd /hive/data/genomes/mm10Patch4/bed/repeatMasker
    time (doRepeatMasker.pl -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` mm10Patch4) \
          > do.log 2>&1
    # real    64m1.494s

    cat faSize.rmsk.txt
# 72697066 bases (1268704 N's 71428362 real 39545014 upper 31883348 lower)
#    in 130 sequences in 1 files
# Total size: mean 559208.2 sd 959105.8 min 25407 (chr19_JH584319_alt)
#    max 5956088 (chr6_GL456054v2_alt) median 221588
# %43.86 masked total, %44.64 masked real

    mkdir /hive/data/genomes/mm10Patch4/bed/simpleRepeat
    cd /hive/data/genomes/mm10Patch4/bed/simpleRepeat
    time (doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \
        -smallClusterHub=ku -buildDir=`pwd` mm10Patch4) > do.log 2>&1
    # real    6m33.708s
    cat fb.simpleRepeat 
    # cat 2578582 bases of 71430661 (3.610%) in intersection
    # the simpleRepeat procedure fails in the cleanup step since there
    # is no TrfPart directory

    mkdir /hive/data/genomes/mm10Patch4/bed/windowMasker
    cd /hive/data/genomes/mm10Patch4/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev mm10Patch4) > do.log 2>&1
    # real    2m44.252s
    cat fb.mm10Patch4.rmsk.windowmaskerSdust.txt
    #  14367720 bases of 72697066 (19.764%) in intersection

    cd /hive/data/genomes/mm10Patch4
    twoBitMask mm10Patch4.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed mm10Patch4.2bit
    twoBitToFa mm10Patch4.2bit stdout | faSize stdin

    # 72697066 bases (1268704 N's 71428362 real 39482408 upper 31945954 lower)
    #    in 130 sequences in 1 files
    # Total size: mean 559208.2 sd 959105.8 min 25407 (chr19_JH584319_alt)
    #    max 5956088 (chr6_GL456054v2_alt) median 221588
    # %43.94 masked total, %44.72 masked real

    ln -s `pwd`/mm10Patch4.2bit /gbdb/mm10Patch4

#########################################################################
# ucscToINSDC table/track (DONE - 2016-04-14 - Hiram)
    # the sequence here is working for a 'refseq' assembly with a chrM
    # situation may be specific depending upon what is available in the assembly

    mkdir /hive/data/genomes/mm10Patch4/bed/ucscToINSDC
    cd /hive/data/genomes/mm10Patch4/bed/ucscToINSDC

    # special script to get the names directly out of the assembly report:
    ./processReport.pl  | sort > ucscToINSDC.txt

    awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt
    # there is no name for chrM/NC_001323.1 sequence, there is no such
    #  sequence with an INSDC name
    grep -v "^#" /hive/data/genomes/mm10/bed/mm10Patch4/genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt | cut -f5,7 \
      | sed -e 's/na\b/notAvailable/;' | awk '{printf "%s\t%s\n", $1, $2}' \
         | sort > insdc.refseq.txt
    # the sed \b means to match word

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join insdc.refseq.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \
           > ucscToINSDC.bed

    # should be same line counts throughout (except for insdc.refseq.txt):
    wc -l *
    # 196 insdc.refseq.txt
    # 130 insdcToUcsc.txt
    # 130 name.coordinate.tab
    # 130 ucscToINSDC.bed
    # 130 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab mm10Patch4 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords mm10Patch4
    # should cover %100 entirely:
    featureBits -countGaps mm10Patch4 ucscToINSDC
    # 72697066 bases of 72697066 (100.000%) in intersection

    join -1 2 <(sort -k2 ucscToINSDC.txt) insdc.refseq.txt | tr '[ ]' '[\t]' \
      | sort -k2 | join -2 2 name.coordinate.tab - |  tr '[ ]' '[\t]' \
        | cut -f1-4 > ucscToRefSeq.bed
    cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab mm10Patch4 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    checkTableCoords  mm10Patch4 -table=ucscToRefSeq
    # should cover %100 all bases:
    featureBits -countGaps mm10Patch4 ucscToRefSeq
    # 72697066 bases of 72697066 (100.000%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2016-05-17 - Hiram)

    mkdir /hive/data/genomes/mm10Patch4/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/mm10Patch4/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/mm10Patch4/mm10Patch4.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku mm10Patch4) > do.log 2>&1
    # real    1m16.641s

    cat fb.mm10Patch4.cpgIslandExtUnmasked.txt
    # 563715 bases of 71430661 (0.789%) in intersection

##########################################################################
# cpgIslands - (DONE - 2016-05-17 - Hiram)
    mkdir /hive/data/genomes/mm10Patch4/bed/cpgIslands
    cd /hive/data/genomes/mm10Patch4/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku mm10Patch4) > do.log 2>&1
    # real    1m14.891s

    cat fb.mm10Patch4.cpgIslandExt.txt
    # 558038 bases of 71430661 (0.781%) in intersection

##############################################################################
# cytoBandIdeo - (DONE - 2016-05-17 - Hiram)
    mkdir /hive/data/genomes/mm10Patch4/bed/cytoBand
    cd /hive/data/genomes/mm10Patch4/bed/cytoBand
    makeCytoBandIdeo.csh mm10Patch4

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2016-04-14 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/mm10Patch4/bed/microsat
    cd /hive/data/genomes/mm10Patch4/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed mm10Patch4 microsat microsat.bed
    # Read 2030 elements of size 4 from microsat.bed

#############################################################################
# lastz alignments to mm10 (DONE - 2016-05-17 - Hiram)
#############################################################################

mkdir /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17

# prepare bits of mm10 sequence to lastz align to the patches,
# this is selecting out the specific section of mm10 where the patch
# is supposed to match, and setting up lastz parameters

rm -fr mm10Bits run.blastz run.blastz/tParts run.blastz/qParts psl mm10Bits.lift
mkdir -p mm10Bits run.blastz run.blastz/tParts run.blastz/qParts psl
cut -f4 ../patchLocations.bed | while read FIX
do
  chr=`grep "${FIX}" ../patchLocations.bed | cut -f1`
  start=`grep "${FIX}" ../patchLocations.bed | cut -f2`
  end=`grep "${FIX}" ../patchLocations.bed | cut -f3`
  bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'`
  chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2`
#  printf "grep '${FIX}' ../mm10Patch4.chrom.sizes | cut -f2\n" 1>&2
  fixSize=`grep "${FIX}" ../mm10Patch4.chrom.sizes | cut -f2`
#  printf "${chr}:${start}-${end} vs. ${FIX}:0-${fixSize}\n" 1>&2
  twoBitToFa /hive/data/genomes/mm10/mm10.unmasked.2bit:${chr}:${start}-${end} stdout \
    | sed -e "s/${chr}:/${FIX}_/g" > mm10Bits/${FIX}.fa
  fixName=`head -1 mm10Bits/${FIX}.fa | sed -e 's/>//;'`
  printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" 1>&2
  printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" >> mm10Bits.lift
  printf "/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit:${fixName}:0-${bitSize}\n" 1>&2
  printf "/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit:${fixName}:0-${bitSize}\n" > run.blastz/qParts/${fixName}.lst
  printf "/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit:${FIX}:0-${fixSize}\n" > run.blastz/tParts/${fixName}.lst
  printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" 1>&2
  printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n"  >> run.blastz/jobList
done

faToTwoBit mm10Bits/*.fa mm10Bits.2bit
twoBitInfo mm10Bits.2bit stdout | sort -k2n > mm10Bits.chrom.sizes

printf 'BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
# mouse vs mouse
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Mouse Mm10Patch4
SEQ1_DIR=/hive/data/genomes/mm10Patch4/mm10Patch4.unmasked.2bit
SEQ1_LEN=/hive/data/genomes/mm10Patch4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
SEQ1_IN_CONTIGS=0

# QUERY: Mouse Mm10
SEQ2_DIR=/hive/data/genomes/mm10/mm10.unmasked.2bit
SEQ2_LEN=/scratch/data/mm10/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit
SEQ2_CTGLEN=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.chrom.sizes
SEQ2_LIFT=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.lift
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
SEQ2_LIMIT=1

BASE=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17
TMPDIR=/dev/shm
' > DEF

ssh ku
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/run.blastz
para create jobList
para try ... check ... push ... etc
para time > run.time
# Completed: 122 of 122 jobs
# CPU time in finished jobs:        233s       3.89m     0.06h    0.00d  0.000 y
# IO & Wait Time:                   376s       6.26m     0.10h    0.00d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest finished job:              22s       0.37m     0.01h    0.00d
# Submission to last job:            47s       0.78m     0.01h    0.00d

# put together the individual results
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17
mkdir pslParts
cat psl/chr*.psl | gzip -c > pslParts/mm10Patch4.mm10.psl.gz

#   constructing a chain from those results
mkdir -p /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/axtChain/run
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/axtChain/run
time zcat ../../pslParts/mm10Patch4.mm10.psl.gz \
| axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \
    /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \
    /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit \
    stdout \
| chainAntiRepeat /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \
    /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit  \
    stdin mm10Patch4.mm10.preLift.chain
    #   real    0m16.050s
# -rw-rw-r-- 1 6410864 May 17 11:03 mm10Patch4.mm10.preLift.chain

liftUp -chainQ mm10Patch4.mm10.lifted.chain \
    ../../mm10Bits.lift carry mm10Patch4.mm10.preLift.chain

# constructing the net files:
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/axtChain

chainMergeSort run/mm10Patch4.mm10.lifted.chain \
        | gzip -c > mm10Patch4.mm10.all.chain.gz

chainSplit chain mm10Patch4.mm10.all.chain.gz
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
time chainPreNet -inclHap mm10Patch4.mm10.all.chain.gz \
    ../../mm10Patch4.chrom.sizes \
    /hive/data/genomes/mm10/chrom.sizes stdout \
    | chainNet -inclHap stdin -minSpace=1 ../../mm10Patch4.chrom.sizes \
        /hive/data/genomes/mm10/chrom.sizes stdout /dev/null \
    | netSyntenic stdin noClass.net
    #   real    0m0.811s
# -rw-rw-r-- 1 1398298 May 17 11:06 noClass.net

hgLoadChain -tIndex mm10Patch4 chainMm10 mm10Patch4.mm10.all.chain.gz
# Loading 33147 chains into mm10Patch4.chainMm10

featureBits mm10Patch4 chainMm10Link
# 65748666 bases of 71430661 (92.045%) in intersection

netClass -verbose=0   -noAr noClass.net mm10Patch4 mm10 mm10Patch4.mm10.net

netFilter -minGap=10 mm10Patch4.mm10.net \
  | hgLoadNet -verbose=0 mm10Patch4 netMm10 stdin

# Make liftOver chains:
netChainSubset -verbose=0 noClass.net mm10Patch4.mm10.all.chain.gz stdout \
    | chainStitchId stdin stdout | gzip -c > mm10Patch4.mm10.over.chain.gz
# -rw-rw-r-- 1  225547 May 17 11:09 mm10Patch4.mm10.over.chain.gz

# Make axtNet for download: one .axt per mm10Patch4 seq.
netSplit noClass.net net
cd ..
mkdir -p axtNet
# beware, tcsh scripting here:
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  ../mm10Patch4.unmasked.2bit \
        /hive/data/genomes/mm10/mm10.unmasked.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.mm10Patch4.mm10.net.axt.gz
end

# Make mafNet for multiz: one .maf per mm10Patch4 seq.
mkdir -p mafNet
# beware, tcsh scripting here:
foreach f (axtNet/*.mm10Patch4.mm10.net.axt.gz)
  axtToMaf -tPrefix=mm10Patch4. -qPrefix=mm10. $f \
        ../mm10Patch4.chrom.sizes \
        /hive/data/genomes/mm10/chrom.sizes \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

#############################################################################
# run this same business with mm10 as target, Patch4 sequence as query
#  (DONE - 2016-05-17 - Hiram)
mkdir /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17

printf '# mouse vs. mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Mouse Mm10
SEQ1_DIR=/hive/data/genomes/mm10/mm10.unmasked.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit
SEQ1_CTGLEN=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.lift
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=1

# QUERY: Mouse Mm10Patch4
SEQ2_DIR=/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit
SEQ2_LEN=/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
SEQ2_LIMIT=1

BASE=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17
TMPDIR=/dev/shm
' > DEF

rm -fr mm10Bits run.blastz psl mm10Bits.lift
mkdir -p mm10Bits run.blastz/tParts run.blastz/qParts psl
cut -f4 ../patchLocations.bed | while read FIX
do
  chr=`grep "${FIX}" ../patchLocations.bed | cut -f1`
  start=`grep "${FIX}" ../patchLocations.bed | cut -f2`
  end=`grep "${FIX}" ../patchLocations.bed | cut -f3`
  bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'`
  chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2`
  fixSize=`grep "${FIX}" ../mm10Patch4.chrom.sizes | cut -f2`
  echo ${chr}:${start}-${end} vs. ${FIX}:0-${fixSize} 1>&2
  twoBitToFa /hive/data/genomes/mm10/mm10.unmasked.2bit:${chr}:${start}-${end} stdout \
    | sed -e "s/${chr}:/${FIX}_/g"  > mm10Bits/${FIX}.fa
  fixName=`head -1 mm10Bits/${FIX}.fa | sed -e 's/>//;'`
  printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" >> mm10Bits.lift
  printf "/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit:${fixName}:0-${bitSize}\n" > run.blastz/tParts/${FIX}.lst
  printf "/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit:${FIX}:0-${fixSize}\n" > run.blastz/qParts/${FIX}.lst
  printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${FIX}.lst qParts/${FIX}.lst ../DEF {check out exists ../psl/${FIX}.psl}\n" >> run.blastz/jobList
done

faToTwoBit mm10Bits/*.fa mm10Bits.2bit
twoBitInfo mm10Bits.2bit stdout | sort -k2nr > mm10Bits.chrom.sizes

ssh ku
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/run.blastz
para create jobList
para try ... check ... push ... etc
para time
# Completed: 122 of 122 jobs
# CPU time in finished jobs:        225s       3.75m     0.06h    0.00d  0.000 y
# IO & Wait Time:                   361s       6.01m     0.10h    0.00d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest finished job:              20s       0.33m     0.01h    0.00d
# Submission to last job:            43s       0.72m     0.01h    0.00d

# put together the individual results
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17
mkdir pslParts
cat psl/chr*.psl | gzip -c > pslParts/mm10.mm10Patch4.psl.gz
# -rw-rw-r-- 1 3139605 May 17 11:27 mm10.mm10Patch4.psl.gz

#   constructing a chain from those results
mkdir -p /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain/run
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain/run

time zcat ../../pslParts/mm10.mm10Patch4.psl.gz \
| axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \
    /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit \
    /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \
    stdout \
     | chainAntiRepeat /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit \
     /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \
    stdin mm10.mm10Patch4.preLift.chain
    #   real    0m16.904s
# -rw-rw-r-- 1 6251918 May 17 11:28 mm10.mm10Patch4.preLift.chain

liftUp mm10.mm10Patch4.lifted.chain \
    ../../mm10Bits.lift carry mm10.mm10Patch4.preLift.chain

# constructing the net files:
cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain

chainMergeSort run/mm10.mm10Patch4.lifted.chain \
        | gzip -c > mm10.mm10Patch4.all.chain.gz
# -rw-rw-r-- 1 1423248 May 17 11:30 mm10.mm10Patch4.all.chain.gz

hgLoadChain -tIndex mm10 chainMm10Patch4 mm10.mm10Patch4.all.chain.gz
# Loading 31778 chains into mm10.chainMm10Patch4

featureBits mm10 chainMm10Patch4Link
# 56269294 bases of 2652783500 (2.121%) in intersection

chainSplit chain mm10.mm10Patch4.all.chain.gz
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
time chainPreNet -inclHap mm10.mm10Patch4.all.chain.gz \
    /hive/data/genomes/mm10/chrom.sizes \
    ../../mm10Patch4.chrom.sizes stdout \
    | chainNet -inclHap stdin -minSpace=1 /hive/data/genomes/mm10/chrom.sizes \
        ../../mm10Patch4.chrom.sizes stdout /dev/null \
    | netSyntenic stdin noClass.net
    #   real    0m0.576s

netClass -verbose=0 -noAr noClass.net mm10 mm10Patch4 mm10.mm10Patch4.net
# -rw-rw-r-- 1 2854952 Jun 29 16:44 mm10.mm10Patch4.net

netFilter -minGap=10 mm10.mm10Patch4.net \
  | hgLoadNet -verbose=0 mm10 netMm10Patch4 stdin

# Make liftOver chains:
netChainSubset -verbose=0 noClass.net mm10.mm10Patch4.all.chain.gz stdout \
    | chainStitchId stdin stdout | gzip -c > mm10.mm10Patch4.over.chain.gz
# -rw-rw-r-- 1  202518 Jun 29 16:47 mm10.mm10Patch4.over.chain.gz

# Make axtNet for download: one .axt per mm10Patch4 seq.
netSplit noClass.net net
cd ..
mkdir -p axtNet
# more tcsh here
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  /hive/data/genomes/mm10/mm10.unmasked.2bit \
        ../mm10Patch4.unmasked.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.mm10.mm10Patch4.net.axt.gz
end

# Make mafNet for multiz: one .maf per mm10Patch4 seq.
mkdir -p mafNet
# tcsh loop again
foreach f (axtNet/*.mm10.mm10Patch4.net.axt.gz)
  axtToMaf -tPrefix=mm10. -qPrefix=mm10Patch4. $f \
        /hive/data/genomes/mm10/chrom.sizes \
        ../mm10Patch4.chrom.sizes \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain
mkdir -p queryChains
chainSplit -q queryChains mm10.mm10Patch4.all.chain.gz

# then run a 'lift over' chain/net on each single one
mkdir -p singleLiftOver

for F in queryChains/*.chain
do
    C=`basename ${F}`
    B=`echo ${C} | sed -e "s/.chain//"`
    chainPreNet -inclHap ${F} /hive/data/genomes/mm10/chrom.sizes \
        ../../mm10Patch4.chrom.sizes stdout \
    | chainNet -inclHap stdin -minSpace=1 /hive/data/genomes/mm10/chrom.sizes \
        ../../mm10Patch4.chrom.sizes singleLiftOver/${B}.raw.net \
        /dev/null
    netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net
    netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net
    netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \
        ${F} stdout \
    | chainStitchId stdin stdout > singleLiftOver/${C}
    printf "${F} -> singleLiftOver/${C}" 1>&2
done

# put the chains together into one file
chainMergeSort singleLiftOver/chr*.chain | gzip -c \
        > mm10.mm10Patch4.single.over.chain.gz
# -rw-rw-r-- 1  268273 May 17 11:38 mm10.mm10Patch4.single.over.chain.gz

 # construct psl files from those chains
chainToPsl mm10.mm10Patch4.single.over.chain.gz \
   /hive/data/genomes/mm10/chrom.sizes \
    ../../mm10Patch4.chrom.sizes \
     /hive/data/genomes/mm10/mm10.unmasked.2bit \
       ../../mm10Patch4.unmasked.2bit \
        mm10.mm10Patch4.over.psl
# pslCheck reports no errors
pslCheck -db=mm10 mm10.mm10Patch4.over.psl
# checked: 1417 failed: 0 errors: 0

# load this PSL track
#     this table name prefix altSeqLiftOverPsl is recognized in hgc clicks
hgLoadPsl mm10 -table=altSeqLiftOverPslP4 mm10.mm10Patch4.over.psl

mkdir /hive/data/genomes/mm10/bed/mm10Patch4/seqExt
cd /hive/data/genomes/mm10/bed/mm10Patch4/seqExt
twoBitToFa ../mm10Patch4.unmasked.2bit mm10Patch4.fa
mkdir -p /gbdb/mm10/mm10Patch4 mm10Patch4
faSplit byname mm10Patch4.fa ./mm10Patch4/
ln -s `pwd`/mm10Patch4/*.fa /gbdb/mm10/mm10Patch4
hgLoadSeq -drop -seqTbl=seqMm10Patch4 -extFileTbl=extMm10Patch4 mm10 \
        /gbdb/mm10/mm10Patch4/*.fa

#############################################################################
# wrap up scripts and file editing (trackDb, hgdownload)
#  (DONE - 2016-05-17 - Hiram)

# edit ${HOME}/kent/src/hg/makeDb/doc/mm10/patchDescr.pl to reflect this most
# recent patch version.

cd /hive/data/genomes/mm10/bed/mm10Patch4
  ${HOME}/kent/src/hg/makeDb/doc/mm10Patch4/patchDescr.pl \
    > ${HOME}/kent/src/hg/makeDb/trackDb/mouse/mm10/mm10Patch4.html

mkdir /hive/data/genomes/mm10/bed/mm10Patch4/downloads
cd /hive/data/genomes/mm10/bed/mm10Patch4/downloads
ln -s ../mm10Patch4.unmasked.2bit mm10Patch4.2bit
ln -s ../mm10Patch4.agp
ln -s ../mm10Patch4.fa.gz
md5sum * > md5sum.txt
mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/mm10Patch4
ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/mm10Patch4/

# copy README.txt from previous patch and edit as needed - replace the
# patch description lines with the output of running
# /hive/data/genomes/mm10/bed/mm10Patch4/readmeLines.pl

/cluster/home/hiram/kent/src/hg/makeDb/doc/mm10Patch4/readmeLines.pl \
    genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \
    genbank/GCA_000001635.6_GRCm38.p4_assembly_regions.txt \
     patchLocations.bed new.sequences.list

#############################################################################
