####
# 2025-06-07 CLS long-read RNA  (Mark Diekhans <markd@ucsc.edu>)
#
# See also hg38/clsLongReadRna.txt
####

# Locations of the track-building tool, the .as schema files, and the awk
# helper; all of them live under the same source directory.
clsSrc="$HOME/compbio/browser/dev/kent/src/hg/makeDb/outside/clsLongReadRna"
clsTrackTool="$clsSrc/clsTrackTool"
clsModelBedAs="$clsSrc/clsModelBed.as"
clsTargetBedAs="$clsSrc/clsTargetBed.as"
gff_attrs_extract="$clsSrc/gff-attrs-extract.awk"

# Work in the mm10 build directory for this track; the relative paths used in
# the steps below are resolved from here.
cd "/hive/data/genomes/mm10/bed/clsLongReadRna"

# Obtain the human and mouse BAMs and their indexes; the paths are listed in:
  https://public-docs.crg.es/rguigo/Data/scarbonell/gencode_paper2024_figures/browser_tracks/hub/path2LyRicBAMs.tsv

  The BAMs were copied directly from CRG via my account and then split into
  pre-capture and post-capture directories.


# download LyRic models and metadata:
  The metadata is documented here:
  https://github.com/guigolab/gencode-cls-master-table
  # Download into lyric-data/ without changing directory: the later steps
  # reference these files as lyric-data/... relative to the track build
  # directory, so the working directory must stay there.
  # NOTE(review): the master table URL uses the moving "latest" release while
  # the other two files are pinned to named releases -- confirm whether the
  # master table should be pinned too so this step is reproducible.
  mkdir -p lyric-data
  wget -nv -P lyric-data https://github.com/guigolab/gencode-cls-master-table/releases/latest/download/Mv2_masterTable_refined.gtf.gz
  wget -nv -P lyric-data https://github.com/guigolab/gencode-cls-master-table/releases/download/GencodeCLS_v1.0/Mv2_metadata.tsv.gz
  wget -nv -P lyric-data https://github.com/guigolab/gencode-cls-master-table/releases/download/Supplementary/Mv2_CLS3_targetDesign_mergedRegions.gtf.gz

# check BAMs vs metadata
  # The metadata path is relative to the current directory, and "." is passed
  # as the second argument.
  "$clsTrackTool" check-bams \
      lyric-data/Mv2_metadata.tsv.gz \
      .

# process GTF
  # Convert to BED via GTF -> GFF3 -> genePred -> BED; going via GFF3 is needed
  # to get the attributes written to a file.
  #
  # Everything runs in a subshell so that the cd into lyric-data/ does not leak
  # into the later steps, which use paths relative to the track build
  # directory.  errexit/pipefail stop the step if any pipeline stage fails,
  # instead of silently leaving a truncated output file; nounset catches a
  # missing $gff_attrs_extract.
  (
    set -eu -o pipefail
    cd lyric-data/

    # tawk copies column 2 into column 7 (BED chromStart -> thickStart) before
    # the BED is sorted by chrom and start and compressed.
    zcat Mv2_masterTable_refined.gtf.gz \
      | gffread -F \
      | gff3ToGenePred -attrsOut=Mv2_masterTable_refined.attrs.tab stdin stdout \
      | genePredToBed stdin stdout \
      | tawk '{$7=$2; print}' \
      | sort -k1,1 -k2,2n \
      | gzip -9 > Mv2_masterTable_refined.bed.gz

    # extract tags of interest to TSV; the attrs file is only an intermediate
    tawk -f "$gff_attrs_extract" Mv2_masterTable_refined.attrs.tab \
      | pigz -c > Mv2_transcript-meta.tsv.gz
    rm Mv2_masterTable_refined.attrs.tab

    # build target BEDs (discards ERCC-0* entries); output goes to the track
    # build directory, one level up.
    # NOTE(review): ../../target-gff-to-bed resolves two levels above
    # lyric-data/, i.e. outside the track build directory -- confirm this is
    # the intended location of the script.
    zcat Mv2_CLS3_targetDesign_mergedRegions.gtf.gz \
      | ../../target-gff-to-bed \
      | pigz -c > ../cls-targets.bed.gz
  )

# make model BEDs:
  one for each sample+phase+platform,
  one for each sample, and
  one with all models

  chromAliasBb=/hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb

  # Start make-beds in the background and remember its PID; the target bigBed
  # below is built from cls-targets.bed.gz while it runs.
  "$clsTrackTool" make-beds lyric-data/Mv2_metadata.tsv.gz lyric-data/Mv2_masterTable_refined.bed.gz lyric-data/Mv2_transcript-meta.tsv.gz . &
  makeBedsPid=$!
  bedToBigBed -tab -type=bed6+2 -as="$clsTargetBedAs" -sizesIsChromAliasBb cls-targets.bed.gz "$chromAliasBb" cls-targets.bb

  # The model BEDs converted below are presumably the ones make-beds writes, so
  # wait for the background job to finish (and succeed) before reading them.
  if wait "$makeBedsPid" ; then
    # cls-models*.bed.gz also matches cls-models.bed.gz, so it does not need to
    # be listed separately (listing it would just convert it twice).
    for bed in cls-models*.bed.gz *capture/*.bed.gz ; do
       # skip a pattern that matched nothing and was left as a literal string
       [[ -e "$bed" ]] || continue
       # the .bb is written next to its .bed.gz
       bedToBigBed -tab -type=bed12+3 -as="$clsModelBedAs" -sizesIsChromAliasBb "$bed" "$chromAliasBb" "${bed%.bed.gz}.bb"
    done
  else
    echo "make-beds failed; model bigBeds were not built" >&2
  fi

# make trackDb
  # Arguments after the --parent option: the metadata file, the gbdb directory
  # for this track, and the trackDb .ra path (presumably the file written).
  "$clsTrackTool" make-trackdb \
      --parent=long_read_transcripts \
      lyric-data/Mv2_metadata.tsv.gz \
      /gbdb/mm10/clsLongReadRna \
      ~/kent/src/hg/makeDb/trackDb/mouse/mm10/clsLongReadRna.ra

  # edit to make it a member of long_read_transcripts.ra by adding the line:
    include clsLongReadRna.ra alpha

# link to gbdb
  # Symlink every bigBed and every BAM-related file (*.bam* also matches the
  # index files) into /gbdb, keeping the same relative path as under the
  # current directory.  find output is read NUL-delimited and every expansion
  # is quoted so paths are never word-split or glob-expanded.
  gbdbDir=/gbdb/mm10/clsLongReadRna
  mkdir -p "$gbdbDir"/{post-capture,pre-capture}
  find . \( -name '*.bb' -o -name '*.bam*' \) -print0 \
    | while IFS= read -r -d '' bb ; do
        # link to the resolved absolute path; -f replaces an existing link
        ln -sf "$(realpath "$bb")" "$gbdbDir/$bb"
      done
