##################
# ASE (Allele-Specific Expression) from Lappalainen lab, NY Genome Center
# (1016-06-20 kate)

cd /hive/data/outside/GTEx/V6
makdir ase
cd ase

###################3
# June 28 full dataset from Stephane

# Move earlier experiments to old/ directory.

# combined:  all*txt

# filter to >1 sample, >30 reads at the site

# extract highest scoring:
awk '$3 > 1 && $4 > 30 {print}' *.txt > all.1-30.txt

awk '$5==.5 && $6==.5 && $7==.5 && $8==.5 {print}' all.1-30.txt > all.high.txt

awk '{printf "chr%d %d %d rsNNNN %.2f %d %.1f \n", $1, $2-1, $2, $5, $3, $4}' all.high.txt | sed 's/ /\t/g' > all.high.bed

# limit further - cov=100, donors=20
awk '$6>100 {print}' ../all*txt | awk '$5>20' > ../all-100-20.txt
[kate@hgwdev results]$ wc -l !$
wc -l ../all-100-20.txt


#2780

$ textHistogram -col=5 -real -skip=1 -binSize=.1 all.1-30.txt
0.000000 ************************************************************ 82477
0.100000 *********** 15192
0.200000 *** 3852
0.300000 * 1529
0.400000 * 1130
0.500000 *** 4155


# Load GTEx tissue abbreviations into new column (abbrev) of gtexTissue table
#       alter table gtexTissue add column abbrev varchar(255) after color
# (Actually, use hgLoadSqlTab for this)


# Parse per-tissue ASE files into BED 9+9 format  (gtexAse.as)
cd /hive/data/outside/GTEx/V6/ase
mkdir -p out/hg19 out/hg38
mkdir -p hub/hg19 hub/hg38
set bin = ~/bin/x86_64
$bin/hgGtexAse hg19 gtexAwgAse lab/results out/hg19 -verbose=3

cd out/hg19
set lib = ~/kent/src/hg/lib
set sizes = /hive/data/genomes/hg19/chrom.sizes
foreach f (`ls *.tab`)
    set t = $f:r
    echo $t
    bedToBigBed -type=bed9+9 -as=$lib/gtexAse.as $t.tab $sizes ../../hub/hg19/$t.bb
end

set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz
set sizes = /hive/data/genomes/hg38/chrom.sizes
set files = `ls *.tab`
foreach f ($files)
    set t = $f:r
    echo "Lifting $t"
    liftOver -bedPlus=9 $f $chain stdout ../hg38/$t.unmapped  | bedSort stdin ../hg38/$f
    echo "Biggifying $t"
    bedToBigBed -type=bed9+9 -as=$lib/gtexAse.as ../hg38/$f $sizes ../hg38/$t.bb
end

# Combined track
cd /hive/data/outside/GTEx/V6/ase
set bin = ~/bin/x86_64
$bin/hgGtexAse hg19 gtexAwgAse lab/all out/hg19 -verbose=3
cd out/hg19
set t = gtexAwgAseCombined
set lib = ~/kent/src/hg/lib
set sizes = /hive/data/genomes/hg19/chrom.sizes
bedToBigBed -type=bed9+9 -as=$lib/gtexAse.as $t.tab $sizes ../../hub/hg19/$t.bb

# lift to hg38
set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz
set sizes = /hive/data/genomes/hg38/chrom.sizes
liftOver -bedPlus=9 $t.tab $chain stdout ../hg38/$t.unmapped  | bedSort stdin ../hg38/$t.tab
bedToBigBed -type=bed8+10 -as=$lib/gtexAse.as ../hg38/$t.tab $sizes ../../hub/hg38/$t.bb

###############
# Cross-tissue sliding window analysis from Stephane (on 7/28).  After review during the July
# meeting, he suggested this would be more informative as summary than the Combined track.

cd /hive/data/outside/GTEx/V6/ase
mkdir windows; cd windows

# process file from dropbox
gunzip -c all_tissues.ase_stats.window.10.2.bedgraph.gz | \
        sed 's/^/chr/' | sort -k1,1 -k2,2n > ase.hg19.bg

# To remove overlaps in bedGraph, but maintain shape of graph:
# break into 1bp segments, replace overlaps w/ median, merge, as per S. Castel
# Really inefficient way to do this, but let's give it a go.
cat > break.pl << 'EOF'
while (<>) {
    ($chr, $s, $e, $val) = split;
    $s--; # correct start base
    while ($s lt $e) {
        print $chr, "\t", $s, "\t", $s+1, "\t", $val, "\n";
        $s++;
    }
}
'EOF'
perl break.pl < ase.hg19.bg > ase.hg19.bases.bg
# this takes a few hours

bedSplitOnChrom ase.hg19.bases.bg split
cd split

cat > merge.csh << 'EOF'
foreach f (*.bed)
    echo $f
    bedSort $f $f.sorted
    /cluster/bin/bedtools/mergeBed -i $f.sorted -c 4 -o median > $f.merged
    bedGraphPack $f.merged $f.packed
end
'EOF'

csh merge.csh > merge.log
cat *.packed > ../out/ase_density.hg19.bg

# create hg38 version
cd out
set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz
liftOver -bedPlus=3 ase_density.hg19.bg $chain stdout unmapped.txt  | \
        bedSort stdin ase_density.hg38.bg

# convert to bigWig
foreach db (hg19 hg38)
    set sizes = /hive/data/genomes/$db/chrom.sizes
    bedGraphToBigWig ase_density.$db.bg $sizes gtexAwgAseDensity.$db.bw
end

###########
# Hub construction

cd /hive/data/gbdb
ln -s /hive/data/outside/gtex/analysis/hub gtexAnalysis

cp gtex/hub.test.txt gtexAnalysis
cp -rp gtex/images gtexAnalysis
cd gtexAnalysis

# create filelist.txt (for push to hgdownload)

echo hub.txt genomes.txt gtexAnalysis.html > filelist.txt
find images hg19 hg38 -type f -print >> filelist.txt

