#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: cmap-cmp-matrix-bw-builds.pl,v 1.8 2007/06/05 20:18:55 kclark Exp $

use strict;
use warnings;
use Bio::GMOD::CMap;
use Data::Dumper;
use English qw( -no_match_vars );
use File::Basename;
use Getopt::Long;
use Gramene::Utils qw( commify );
use List::Util qw( max );
use Pod::Usage;
use Readonly;
use Text::TabularDisplay;

Readonly my $VERSION => sprintf '%d.%02d', 
                        qq$Revision: 1.8 $ =~ /(\d+)\.(\d+)/;

my $map_set_accs       = '';
my $map_set_accs_skip  = '';
my $aliases            = '';
my $tolerance          = 0.1; # default ten percent
my $number_change      = 0; 
my $show_decrease_only = 0;

my ( $help, $man_page, $show_version );
GetOptions(
    'help'             => \$help,
    'man'              => \$man_page,
    'version'          => \$show_version,
    'm|map_set_accs:s' => \$map_set_accs,
    's|skip:s'         => \$map_set_accs_skip,
    'a|aliases:s'      => \$aliases,
    't|tolerance:s'    => \$tolerance,
    'n|number:s'       => \$number_change,
    'd|decrease'       => \$show_decrease_only,
) or pod2usage(2);

if ( $help || $man_page ) {
    pod2usage({
        -exitval => 0,
        -verbose => $man_page ? 2 : 1
    });
}; 

if ( $show_version ) {
    my $prog = basename( $PROGRAM_NAME );
    print "$prog v$VERSION\n";
    exit 0;
}

my @builds = @ARGV;

unless ( scalar @builds == 2 ) {
    pod2usage('Need two build names (e.g., "19" and "20")');
}

if ( $tolerance < 0 ) {
    pod2usage('Tolerance must be a positive number'); 
}
elsif ( $tolerance > 1 ) {
    $tolerance = $tolerance / 100;
}

my %alias = map { split /=/ } split /,/, $aliases;

my %matrix;
for my $build ( @builds ) {
    if ( $build !~ /^Build/ ) {
        $build = "Build${build}";
    }

    my $cmap = Bio::GMOD::CMap->new;
    $cmap->data_source( $build ) or die $cmap->error;

    my $data = $cmap->data_module;

    $matrix{ $build } = $data->matrix_correspondence_data( show_matrix => 1 );
}

my $m1 = $matrix{ $builds[0] };
my $m2 = $matrix{ $builds[1] };

my %matrix1;
for my $ms1 ( @{ $m1->{'matrix'}{'data'} } ) {
    for my $corr ( @{ $ms1->{'correspondences'} } ) {
        $matrix1{ $ms1->{'map_set_acc'} }{ $corr->{'map_set_acc'} } = 
            $corr->{'number'};
    }
}

complement(\%matrix1);

for my $k1 ( keys %matrix1 ) {
    if ( my $alias = $alias{ $k1 } ) {
        for my $k2 ( keys %{ $matrix1{ $k1 } } ) {
            $matrix1{ $alias }{ $k2 } = $matrix1{ $k1 }{ $k2 };
        }
    }
}

complement(\%matrix1);

my %restrict_to = map { $_, 1 } split /,/, $map_set_accs;
my %skip        = map { $_, 1 } split /,/, $map_set_accs_skip;
for my $alias ( map { $alias{ $_ } || () } keys %restrict_to ) {
    $restrict_to{ $alias } = 1;
}

my @problems;
my %seen;
for my $ms2 ( @{ $m2->{'matrix'}{'data'} } ) {
    CORR:
    for my $corr ( @{ $ms2->{'correspondences'} } ) {
        my $from_ms_acc = $ms2->{'map_set_acc'};
        my $to_ms_acc   = $corr->{'map_set_acc'};
        my $seen_key    = join('::', sort($from_ms_acc, $to_ms_acc));

        next CORR if $seen{ $seen_key }++;

        next CORR if $skip{ $from_ms_acc } || $skip{ $to_ms_acc };

#        next CORR unless exists $matrix1{ $from_ms_acc }{ $to_ms_acc };

        if ( %restrict_to ) {
            next CORR unless 
                $restrict_to{ $from_ms_acc } || $restrict_to{ $to_ms_acc }
            ;
        }

        my $n1 = $matrix1{ $from_ms_acc }{ $to_ms_acc } || 0;
        my $n2 = $corr->{'number'}                      || 0;

        next unless $n1 && $n2;

        my $diff = abs($n1-$n2);

        if ( $number_change && $diff < $number_change ) {
            next CORR;
        }

        my $pct_diff = sprintf('%.2f', $n1 > 0 ? $diff/$n1 : '100');
        my $diff_display 
            = join('', $n1 > $n2 ? '-' : '', commify($pct_diff * 100));

        if ( $pct_diff >= $tolerance ) {
            push @problems, {
                ms1          => $from_ms_acc,
                ms2          => $to_ms_acc,
                n1           => $n1,
                n2           => $n2,
                $builds[0]   => commify($n1),
                $builds[1]   => commify($n2),
                num_diff     => $diff,
                diff         => $pct_diff,
                diff_display => $diff_display,
            };
        }
    }
}

if ( @problems ) {
    my @tab_cols = ( 'Map Set 1','Map Set 2', @builds, '% Diff' );
    my $tab      = Text::TabularDisplay->new( @tab_cols );

    my $n1_len = max( 
        map { length $_ } 
        ( $builds[0], ( map { $_->{ $builds[0] } } @problems ) )
    );

    my $n2_len = max( 
        map { length $_ } 
        ( $builds[1], ( map { $_->{ $builds[1] } } @problems ) )
    );

    my $diff_len = max( 
        ( map { length($_->{'diff_display'}) } @problems ),
        length($tab_cols[-1])
    );

    my $column = set_cols( \@problems );

    my $num_problems = 0;
    my ( %seen, %column );
    PROBLEM:
    for my $p ( sort { $b->{'diff'} <=> $a->{'diff'} } @problems ) {
        my ( $ms1, $ms2 ) = 
            sort { $column->{ $a } <=> $column->{ $b } } 
            $p->{'ms1'}, $p->{'ms2'}
        ;

        next PROBLEM if $seen{ $ms1 }{ $ms2 }++;

        my $n1       = $p->{'n1'};
        my $n2       = $p->{'n2'};
        my $display1 = $p->{ $builds[0] }; 
        my $display2 = $p->{ $builds[1] }; 

        if ( $show_decrease_only ) {
            next PROBLEM unless $n1 > $n2;
        }

        $tab->add( 
            $ms1, 
            $ms2,
            sprintf("%${n1_len}s", $display1),
            sprintf("%${n2_len}s", $display2),
            sprintf("%${diff_len}s", $p->{'diff_display'}),
        );

        $num_problems++;
    }

    print join "\n", $tab->render, "Found $num_problems problems.", '';
}
else {
    print "No problems between $builds[0] and $builds[1]\n";
}

sub complement {
    my $hash = shift;

    for my $k1 ( keys %$hash ) {
        for my $k2 ( keys %{ $hash->{ $k1 } } ) {
            $hash->{ $k2 }{ $k1 } = $hash->{ $k1 }{ $k2 };
        }
    }
}

sub set_cols {
    my $data = shift;

    my %col;
    for my $rec ( @$data ) {
        my $v1 = $rec->{'ms1'};
        my $v2 = $rec->{'ms2'};
        $col{ $_ } ||= 0 for $v1, $v2;

        if ( 
              !$col{ $v1 } && !$col{ $v2 }
            || $col{ $v1 } ==  $col{ $v2 } 
        ) {
            $col{ $v1 } = 1;
            $col{ $v2 } = 2;
        }
        elsif ( $col{ $v1 } && !$col{ $v2 } ) {
            $col{ $v2 } = $col{ $v1 } == 1 ? 2 : 1;
        }
        elsif ( !$col{ $v1 } && $col{ $v2 } ) {
            $col{ $v1 } = $col{ $v2 } == 1 ? 2 : 1;
        }
    }
    
    return \%col;
}

__END__

# ----------------------------------------------------

=pod 

=head1 NAME

cmap-cmp-matrix-bw-builds.pl - compare CMap correspondences b/w builds

=head1 VERSION

This documentation refers to cmap-cmp-matrix-bw-builds.pl 
version $Revision: 1.8 $

=head1 SYNOPSIS

  cmap-cmp-matrix-bw-builds.pl [options] build1 build2

Options:

  -t|--tolerance=X       Percent difference threshold, e.g., 
                         "0.50" for 50% (default is 10%)
  -d|--decrease          Show only where number has decreased
  -m|--map_set_accs=X,Y  Map set accessions to restrict search
  -s|--skip=X,Y          Map set accessions to skip
  -n|--number=10         Minimum number for change to count
  -a|--aliases=A=B,Y=Z   A comma-separated string of map set 
                         accessions to aliases, e.g., in one build 
                         a map set was called "gt0205" and the next it 
                         was called "gt1005"

  --help                 Show brief help and exit
  --man                  Show full documentation
  --version              Show version and exit

=head1 DESCRIPTION

This script connects to two builds of CMap and queries the
correspondence matrix and compares the common map set pairs to see how
they changed.  The required arguments are two build names (or just the
numbers, e.g., "19" and "20" because all Gramene builds are assumed to
be named like "Build" plus the build number).

To change the tolerable difference, use the "-t" option with an
integer or decimal value of the percent difference threshold, e.g.,
"25" or "0.25" for 25%.

To just restrict the checks to a list of map sets, specify their map
set accessions as a comma-separated list with the "-m" option.

If map sets have changed accessions between builds, use the "-a"
option to indicate this by saying "foo=bar,baz=quux".

The results are sorted in descending order by the percentage difference 
change.

=head1 SEE ALSO

Bio::GMOD::CMap.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
