### adding HPRC multiple alignment to the hs1 browser ###

### pick up the maf file from Glenn:
### /private/home/hickey/dev/work/hprc-v1.0-maf/hprc-v1.0-mc-chm13.single-copy.maf.gz

mkdir /hive/data/genomes/hs1/bed/hprc/fromGlenn
cd /hive/data/genomes/hs1/bed/hprc/fromGlenn

## the file was in a /private/ directory on the phoenix.gi server,
## which is behind the 'prism' firewall, therefore, login to the phoenix
## server and rsync it out of there to hgwdev:

-rw-r--r-- 1 11932640896 Sep  5 11:32 hprc-v1.0-mc-chm13.single-copy.maf.gz
-rw-r--r-- 1     4722431 Sep  5 11:55 hprc-v1.0-mc-chm13.single-copy.maf.gz.tai

## I don't know what the .tai file is, but picked that up also
## scan the maf file for names to see what was used:

zgrep '^s' hprc-v1.0-mc-chm13.single-copy.maf.gz | cut -f2 \
  > raw.name.list

head raw.name.list
CHM13.chr1
HG00673#1.JAHBBZ010000189.1
HG01071#1.JAHBCF010000154.1
HG01071#2.JAHBCE010000132.1
HG01109#2.JAHEOZ010000140.1

tail raw.name.list:
GRCh38.chrY
CHM13.chrY
GRCh38.chrY
CHM13.chrY
GRCh38.chrY

#########################################################################
### fix the names in the maf file to correspond to UCSC names

mkdir  /hive/data/genomes/hs1/bed/hprc/reName
cd  /hive/data/genomes/hs1/bed/hprc/reName

### reuse the name mapping file from this same process in hg38

cp -p \
/hive/data/genomes/hg38/bed/hprc/mafFile/db.hgMatPat.asmName.longLabel.txt .

rw-rw-r-- 1      12514 Aug 17 14:48 db.hgMatPat.asmName.longLabel.txt

head db.hgMatPat.asmName.longLabel.txt 
GCA_018466835.1 HG02257.pat     HG02257#1       HG02257.alt.pat.f1_v2 (May 2021 GCA_018466835.1_HG02257.alt.pat.f1_v2) HPRC project computed Chain Nets
GCA_018466845.1 HG02257.mat     HG02257#2       HG02257.pri.mat.f1_v2 (May 2021 GCA_018466845.1_HG02257.pri.mat.f1_v2) HPRC project computed Chain Nets

### with the perl script:

    time (./reName.pl | gzip -c > renamed.maf.gz) > reName.log 2>&1 &
    # real    52m4.679s
    # -rw-rw-r-- 1 7203795233 Sep  5 15:38 renamed.maf.gz

#!/usr/bin/env perl

use strict;
use warnings;

my %asmNameDb;	# key is asmName in the hal file, value is dbName at UCSC

open (my $fh, "<", "db.hgMatPat.asmName.longLabel.txt") or die "can not read db.hgMatPat.asmName.longLabel.txt";
while (my $line = <$fh>) {
  chomp $line;
  my @a = split('\t', $line);
  $asmNameDb{$a[2]} = $a[0];
}
close ($fh);
$asmNameDb{'GRCh38'} = "hg38";
$asmNameDb{'CHM13'} = "hs1";

open ($fh, "-|", "zcat ../fromGlenn/hprc-v1.0-mc-chm13.single-copy.maf.gz") or die "can not zcat hprc-v1.0-mc-chm13.single-copy.maf.gz";
while (my $line = <$fh>) {
  chomp $line;
  if ($line =~ m/^s\t/) {
    my @a = split('\t', $line, 3);
    my $asmName = $a[1];
    $asmName =~ s/\..*//;
    my $seqName = "hg38";
    if (!defined($asmNameDb{$asmName})) {
      printf "no equivalent name for: '%s'\n", $asmName;
      exit 255;
    }
    $seqName = $asmNameDb{$asmName} if ($asmName ne "GRCh38");
    $a[1] =~ s/^$asmName/$seqName/;
    printf "%s\n", join("\t", @a);
  } else {
    printf "%s\n", $line;
  }
}
close ($fh);


## scan that result:
    zgrep "^s" renamed.maf.gz | cut -f2 > rename.name.scan
## and which genomes:
    grep -v "GC" rename.name.scan | cut -d'.' -f1 | sort | uniq -c
1549489 hg38
1618032 hs1

    grep  "GC" rename.name.scan | cut -d'.' -f1-2 | sort | uniq -c \
       | sort -rn | head


##############################################################################
