#! /bin/bash

# Input file (received by MarkD through email 12 May 2024):
# /hive/data/genomes/hg38/bed/pseudoPipe/Homo_sapiens.GRCh38.103.pseudogene.pseudopipe.wparent.gtf
# This GTF is incorrectly formatted because it has overlapping exons. The python code converts
# the GTF to BED using MarkD's pycbio code, which can be pip installed:
#python3 -m venv mytestvenv
#source mytestvenv/bin/activate
#pip install /hive/groups/browser/pycbio
# The program also merges overlapping exons when they are in the same transcript, because
# the GTF we received contains those.

# Collect the GENCODE attributes used to attach HUGO ids.
# Both the V37 and the V38 attribute tables are queried (both are needed to find all gene IDs).
rm -rf all.ids
for gencodeVersion in 37 38; do
    # One hgsql call per release; the rows from both releases accumulate in all.ids.
    hgsql -N >> all.ids \
        <<< "select geneId, geneName, transcriptId, proteinId from hg38.wgEncodeGencodeAttrsV${gencodeVersion}"
done

annotationFile='/hive/data/genomes/hg38/bed/pseudoPipe/Homo_sapiens.GRCh38.103.pseudogene.pseudopipe.wparent.gtf'

# The GTF contains repeated lines. Keep only the first occurrence of each line,
# preserving the original line order (no sorting involved).
awk '!($0 in seen) { seen[$0]; print }' "$annotationFile" > uniq.gtf

# Convert the de-duplicated GTF to BED, using all.ids as the gene table.
./pseudoPipeToBed.py \
    --genes all.ids \
    --gtf uniq.gtf \
    --bed pgene.bed

# Rename the chromosome IDs (they need to match the parents).
# sub.list pairs column 6 of the chromAlias table with column 1; subColumn then
# rewrites column 1 of pgene.bed with it.
# NOTE(review): presumably column 6 holds the naming scheme used by the GTF and
# column 1 the UCSC name -- confirm against hg38.chromAlias.bb.
bigBedToBed /gbdb/hg38/hg38.chromAlias.bb chromAlias.bed
tawk '{print $6, $1}' chromAlias.bed > sub.list
subColumn 1 pgene.bed sub.list pgene2.bed

# pgene.bed is overwritten with the sorted, renamed records.
bedSort pgene2.bed pgene.bed



########## PARENT TRACK ######################
# Build the parent track and attach parent information to the pseudogenes.

### GENCODE V37: unpack the gzipped BED
gzip -dc /hive/data/genomes/hg38/bed/gencodeV37/build/gencodeV37.bed.gz > v37.bed

### GENCODE V38: extract the BED records from the bigBed
bigBedToBed /hive/data/genomes/hg38/bed/gencodeV38/build/hg38.gencodeV38.bb V38.bed

# V37 records are only wanted when they are not already present in V38
# (matching on column 4 of both files).
selectById -not 4 V38.bed 4 v37.bed > addMe

# Combine V38 with the V37-only records, sort, and keep the 12 BED columns.
# NOTE(review): "-k4" is a key running from column 4 to the end of the line, not
# column 4 alone; left as-is because the parent script may rely on this order.
sort -k4 -k1 V38.bed addMe | cut -f1-12 > annot.bed

./pseudoPipeParents.py \
    --pgenebed pgene.bed \
    --annotbed annot.bed \
    --pgeneout pseudopipePgenes.bed \
    --parentout parents.bed

### convert to bigbed ###
# autoSql schema for the pseudogene track: the 12 standard BED columns followed
# by 11 extra columns (23 columns in total).
cat << '_EOF_' > pseudoPipePgenes.as
table yale_pseudogenes
"Bed 12+11 file with pseudogenes and their (gene, transcript, and protein) parents."
    (
    string chrom;      "Chromosome (or contig, scaffold, etc.)"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Name of item"
    uint   score;      "Score from 0-1000"
    char[1] strand;    "+ or -"
    uint thickStart;   "Start of where display should be thick (start codon)"
    uint thickEnd;     "End of where display should be thick (stop codon)"
    uint reserved;     "Used as itemRgb as of 2004-11-22"
    int blockCount;    "Number of blocks"
    int[blockCount] blockSizes;  "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions relative to chromStart"
    string pgenehugo;           "Pseudogene HUGO ID"
    string parenthugo;          "Parent pseudogene HUGO ID"
    string ppgene;              "Pseudopipe pseudogene gene ID"
    string pptx;                "Pseudopipe pseudogene transcript ID"
    string pgeneType;           "Pseudogene type"
    string pensgene;            "Ensembl pseudogene gene ID"
    string penstx;              "Ensembl pseudogene transcript ID"
    string ensgene;             "Ensembl parent gene ID"
    string _enstx;              "Ensembl parent transcript ID"
    string ensprot;             "Ensembl parent protein ID"
    string enstxurl;            "Ensembl parent transcript ID (url)"
    )
_EOF_

# The -type value has to agree with the schema above. The schema declares 11
# extra columns, so the type is bed12+11 (it previously said bed12+10, which
# contradicted both the schema and its "Bed 12+11" description).
pgeneBedType='bed12+11'
bedToBigBed -type="$pgeneBedType" -tab -as=pseudoPipePgenes.as \
    -extraIndex=pgenehugo,parenthugo,name,ppgene,pptx,pensgene,penstx,ensgene,_enstx,ensprot \
    pseudopipePgenes.bed /hive/data/genomes/hg38/chrom.sizes pseudoPipePgenes.bb

# trix index (make search case insensitive) for hugo and ENSG parent IDs.
# Column 4 is the item name; columns 13-16 and 18-22 are the ID columns of the
# schema. Column 17 (pgeneType) and column 23 (enstxurl) are left out.
cut -f4,13-16,18-22 pseudopipePgenes.bed > name.table
ixIxx name.table pseudoPipePgenes.ix pseudoPipePgenes.ixx

# autoSql schema for the parent track: the 12 standard BED columns followed by
# 3 extra columns (hugo, geneType, url).
cat << '_EOF_' > pseudoPipeParents.as
table yale_parents
"Bed 12+3 file with parents and their pseudogenes."
    (
    string chrom;      "Chromosome (or contig, scaffold, etc.)"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Name of item"
    uint   score;      "Score from 0-1000"
    char[1] strand;    "+ or -"
    uint thickStart;   "Start of where display should be thick (start codon)"
    uint thickEnd;     "End of where display should be thick (stop codon)"
    uint reserved;     "Used as itemRgb as of 2004-11-22"
    int blockCount;    "Number of blocks"
    int[blockCount] blockSizes;  "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions relative to chromStart"
    string hugo;       "Gene ID"
    string geneType;    "Gene type"
    lstring url;        "Link to pseudogene"
    )
_EOF_

# sort so that ENSG comes before PGO Ids, putting the parent at the top
sort -k1,1 -k2,2n -k4 parents.bed > pseudopipeParents.bed
bedToBigBed -type=bed12+3 -tab -as=pseudoPipeParents.as pseudopipeParents.bed /hive/data/genomes/hg38/chrom.sizes pseudoPipeParents.bb

# trix index for the parent track: item name (column 4) plus the gene ID
# (column 13, "hugo" in the schema). The previous column list was 4,10, but
# column 10 is blockCount, so the index held block counts instead of gene IDs.
parentTrixCols='4,13'
cut -f"$parentTrixCols" pseudopipeParents.bed > name.table
ixIxx name.table pseudoPipeParents.ix pseudoPipeParents.ixx

# Remove intermediate files.
# -f keeps the cleanup quiet and successful when a listed file does not exist
# (several of these names are not created anywhere in this script), and "--"
# protects against file names that start with a dash.
rm -f -- addMe v37.tsv v38.sub V38.bed v37.sub sub.me sub.list idMiss.txt
# WARNING: this removes every *.bed file in the current directory, including the
# sorted inputs of the bigBed files -- run this script in a dedicated build directory.
rm -f -- *.bed
rm -f -- name.table uniq.gtf all.ids

