# Fixes to methyl450 track (RM #10447)  (kate 5/16/13)
#
# There are 63 tables, format BED 9:  wgEncodeHaibMethyl450*SitesRep1
#
# An alert mailing list user noticed that probe mappings were incorrect:
# 1) Reverse strand probe is 'flipped'
# 2) Both are off by one (starting at 1 instead of 0)
# 3) Should be single nucleotide dataset, not full probe

    cd /hive/groups/encode/dcc/data
    mkdir HudsonAlphaMethyl450.hg19
    cd HudsonAlphaMethyl450.hg19

# Post new supplemental file with 'beta values' for users and warn them off download files in track

    wget http://mendel.hudsonalpha.org/Public/fpauli/DCC/wgEncodeHaibMethyl450BetaValues.txt

# Add README.txt file to supplemental directory (text from Flo Pauli, HudsonAlpha)

    cp wgEncodeHaibMethyl450BetaValues.txt README.txt /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeHaibMethyl450/supplemental

# Correct coordinates in tables:
#  forward strand: start -= 1, end -= 50
#  reverse strand: end -= 49

cat > fixCoords.pl << 'EOF'
# correct Illumina Infinium probe mappings and truncate regions to 1 bp (BED 9 format)
while (<>) {
    @bed = split;
    if ($bed[5] =~ /\+/) {
        $bed[1] -= 1;
    }
    $bed[2] = $bed[1] + 1;
    $bed[6] = $bed[1];
    $bed[7] = $bed[2];
    print join("\t", @bed) . "\n";
}
'EOF'

# reload tables

cat > reload.csh << 'EOF'
    set tables = `hgsql hg19 -Ne "select tableName from trackDb where tableName like 'wgEncodeHaibMethyl450%Rep1'"`
    foreach t ($tables)
        echo "Reloading: $t"
        echo "select count(*) from $t" | hgsql -N hg19
        echo "select * from $t" | hgsql -N hg19 | cut --complement -f 1 | perl fixCoords.pl | \
            hgLoadBed hg19 $t stdin
    end
'EOF'
csh reload.csh

cp fixCoords.pl /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeHaibMethyl450/fixMethyl450.pl


# problem with A549 -- redo


 dump all tables and grep for probe having bad coord in A549 
 % grep cg02533120 *.loaded.bed

 # wgEncodeHaibMethyl450A549Etoh02SitesRep1.loaded.bed:chr21       33031836        33031837   
 # others:  chr21     33031839        33031840        cg02533120   
 # Only A549 has bad coords!

 # re-fix A549 and reload

    zcat /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeHaibMethyl450/wgEncodeHaibMethyl450A549Etoh02SitesRep1.bed.gz | perl fixCoords.pl > A549.fixed.bed

    % grep cg02533120 A549.fixed.bed
    #chr21   33031839        33031840        cg02533120      81      +       33031839        33031840        0,0,205

   hgLoadBed hg19 wgEncodeHaibMethyl450A549Etoh02SitesRep1 A549.fixed.bed
   # Read 482421 elements of size 9 from A549.fixed.bed





