package Gramene::Marker::Export;

# vim: tw=78: sw=4: ts=4: et: 

# $Id: Export.pm,v 1.10 2007/06/05 19:25:54 kclark Exp $

use strict;
use warnings;
use Carp qw( croak );
use Data::Dumper;
use English qw( -no_match_vars );
use File::Basename;
use File::Spec::Functions;
use File::Path;
use File::Temp qw( tempfile );
use Getopt::Long;
use Gramene::Config;
use Gramene::DB;
use Gramene::CDBI::Markers;
use Gramene::Utils qw( get_logger );
use Pod::Usage;
use Readonly;
use XML::Simple;
use Time::ParseDate qw( parsedate );

delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';

Readonly my $CMAP_ADMIN             
    => q{/usr/local/bin/perl /usr/local/bin/cmap_admin.pl};
Readonly my $COMMA                  => q{,};
Readonly my $COMMA_SPACE            => q{, };
Readonly my $DASH                   => q{-};
Readonly my $DOUBLE_COLON           => q{::};
Readonly my $ID_CORR_EVIDENCE       => q{ID};
Readonly my $MAP_SET_INFO_URL       
    => '/db/markers/marker_view?action=view_map_set&map_set_acc=';
Readonly my $MARKER                 => 'marker';
Readonly my $MARKER_ID_EVIDENCE     => q{ID};
Readonly my $NL                     => qq{\n};
Readonly my $SORT                   => q{/bin/sort};
Readonly my $TAB                    => qq{\t};
#Readonly my $TIGR                   => 'gt0506';
#Readonly my $OMAP_TIGR              => 'omap-' . $TIGR;
Readonly my $UNIQ                   => q{/usr/bin/uniq};
Readonly my @OUT_FIELDS             => qw(
    map_name map_acc map_start map_stop feature_name feature_acc
    feature_aliases feature_stop feature_start feature_type_acc
);
Readonly my $MAPPING_SQL            => q[
    select m.mapping_id,
           m.marker_id, 
           m.display_synonym_id,
           m.cmap_feature_accession as feature_acc,
           m.start as feature_start,
           m.end as feature_stop,
           s.marker_name as feature_name
    from   mapping m, marker_synonym s, analysis a
    where  m.map_id=?
    and    m.analysis_id=a.analysis_id
    and    a.ok_to_release=1
    and    m.display_synonym_id=s.marker_synonym_id
    and    m.cmap_feature_accession is not null
];

Readonly my $ALIAS_SQL              => q[
    select marker_synonym_id, marker_name
    from   marker_synonym
    where  marker_id=?
];

Readonly my $VERSION 
    => sprintf '%d.%02d', qq$Revision: 1.10 $ =~ /(\d+)\.(\d+)/;

# ----------------------------------------------------
sub mappings_to_cmap {
    my %args                 = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;
    my $out_dir              = $args{'out_dir'}           
                               or croak('No out directory');
    my $cmap_ds              = $args{'cmap_data_source'}  
                               or croak('No CMap data source');
    my $allow_update         = $args{'allow_update'} ? '--allow_update' : '';
    my @ms_accs              = ref $args{'map_set_accessions'} eq 'ARRAY'
                               ? @{ $args{'map_set_accessions'} }
                               : split(/,/, $args{'map_set_accessions'});
    my %skip_ms_accs         = map { $_, 1 } (
                               ref $args{'skip_map_set_accessions'} eq 'ARRAY'
                               ? @{ $args{'skip_map_set_accessions'} }
                               : split(/,/, $args{'skip_map_set_accessions'})
                               );
    my %take_marker_type     = map { lc $_, 1 } (
                               ref $args{'marker_types'} eq 'ARRAY'
                               ? @{ $args{'marker_types'} }
                               : split(/,/, $args{'marker_types'})
                               );
    my $mappings_only        = defined $args{'mappings_only'} 
                               ? $args{'mappings_only'} : 0;
    my $correspondences_only = defined $args{'correspondences_only'} 
                               ? $args{'correspondences_only'} : 0;
    my $log_level            = $args{'log_level'}         || 'info';
    my $logger               = get_logger({ log_level => $log_level });
    my $config               = Gramene::Config->new->get('markers');

    my %map_set_export_intramap_correspondences
        = map { $_, 1 }
        split $COMMA, $config->{'cmap_export_intramap_correspondences'} || '';
    my %map_set_export_intermap_correspondences
        = map { $_, 1 }
        split $COMMA, $config->{'cmap_export_intermap_correspondences'} || '';

    $logger->info("Exporting mappings to CMap for map sets ",
        join( $COMMA_SPACE, @ms_accs )
    );

    if ( !@ms_accs ) {
        croak('No map set accession');
    }

    my ( @not_found, %map_set_id );
    for my $ms_acc ( @ms_accs ) {
        next if !$ms_acc || $ms_acc !~ /\w+/;
        next if $skip_ms_accs{ $ms_acc };

        my ($MapSet) = Gramene::CDBI::Markers::MapSet->search(
            cmap_map_set_accession => $ms_acc
        );

        if ( $MapSet ) {
            $map_set_id{ $ms_acc } = $MapSet->id;

#            if ( $ms_acc eq $TIGR ) {
#                $map_set_id{ $OMAP_TIGR } = $MapSet->id;
#            }
        }
        else {
            push @not_found, $ms_acc;
        }
    }

    if ( @not_found ) {
        croak("Couldn't find the following map set accessions:\n",
            join($COMMA_SPACE, @not_found)
        );
    }

    # Populate lookup tables for the marker type and species ids
    my %marker_type_id = 
        map { $_->marker_type, $_->id } 
        Gramene::CDBI::Markers::MarkerType->retrieve_all;
    my %species_id = 
        map { $_->species, $_->id } 
        Gramene::CDBI::Markers::Species->retrieve_all;
    my %marker_type_by_id = reverse %marker_type_id;
    my %species_by_id     = reverse %species_id;

    if ( !-d $out_dir ) {
        mkpath( $out_dir );
    }

    my ( %xml_data, %xml_species, @out_files, @correspondence_files );
    my ( $num_map_sets, $num_mappings ) = ( 0, 0 );

    my $db        = Gramene::CDBI::Markers->db_Main;
    my $db_select = Gramene::DB->new('markers');

    MAP_SET:
    for my $ms_acc ( keys %map_set_id ) {
        $num_map_sets++;
        my $map_set_id = $map_set_id{ $ms_acc };
        my $MapSet     = Gramene::CDBI::Markers::MapSet->retrieve($map_set_id);

        $logger->info("Processing map set '$ms_acc' ($map_set_id)");

        if ( $correspondences_only ) {
            my $export_intramap_correspondences
                = $map_set_export_intramap_correspondences{ $ms_acc } || 0;
            my $export_intermap_correspondences
                = $map_set_export_intermap_correspondences{ $ms_acc } || 0;

            my $corr_filename = catfile( 
                $out_dir, "${ms_acc}-correspondences.tab"
            );

            export_correspondences(
                map_set_id  => $map_set_id,
                map_set_acc => $ms_acc,
                inter_map   => $export_intermap_correspondences,
                intra_map   => $export_intramap_correspondences,
                file_name   => $corr_filename,
                db_select   => $db_select,
                db          => $db,
                logger      => $logger,
            );

            push @correspondence_files, $corr_filename;

            next MAP_SET;
        }

        # 
        # Prepare out file
        # 
        my $out_path = catfile( $out_dir, "${ms_acc}.tab" );

        print STDERR "$num_map_sets: Writing '$out_path'\n";

        open my $out_fh, '>', $out_path 
            or die "Can't write $out_path: $!\n";
        
        print $out_fh join( $TAB, @OUT_FIELDS ), $NL;

        $db_select->{'mysql_use_result'} = 1;

        my $mapping_sth = $db_select->prepare( $MAPPING_SQL );

        my @maps 
            = Gramene::CDBI::Markers::Map->search( map_set_id => $map_set_id );

        my @map_data;
        for my $map ( @maps ) {
            next if !$map->cmap_map_accession;

            $logger->info("Processing map " . $map->map_name);

            my $start = $map->start;
            my $end   = $map->end;

            if ( !defined $start || !defined $end ) {
                ( $start, $end ) = $db->selectrow_array(
                    q[
                        select min(m.start), max(m.end)
                        from   mapping m
                        where  m.map_id=?
                    ],
                    {},
                    ( $map->id )
                );

                $start = 0 if $start > 1;
                $map->start( $start );
                $map->end( $end );
                $map->update;
            }

            my $map_acc = $map->cmap_map_accession;

#            if ( $ms_acc eq $OMAP_TIGR ) {
#                $map_acc = 'omap-' . $map_acc;
#            }

            push @map_data, {
                map_acc       => $map_acc,
                map_name      => $map->map_name,
                map_start     => $map->start,
                map_end       => $map->end,
                display_order => $map->display_order,
            };

            $mapping_sth->execute( $map->id );

            MAPPING:
            while ( my $m = $mapping_sth->fetchrow_hashref ) {
                next if !$m->{'feature_acc'};

#                if ( $ms_acc eq $TIGR ) {
#                    next if $m->{'feature_acc'} =~ /^OMAP/;
#                }
#                elsif ( $ms_acc eq $OMAP_TIGR ) {
#                    if ( $m->{'feature_acc'} !~ /^OMAP/ ) {
#                        $m->{'feature_acc'} = 'OMAP' . $m->{'feature_acc'};
#                    }
#                }

                my $mk_sth = $db->prepare(
                    q[
                        select m.marker_type_id, m.source_species_id,
                               s.marker_name
                        from   marker m, marker_synonym s
                        where  m.marker_id=?
                        and    m.display_synonym_id=s.marker_synonym_id
                    ],
                );
                $mk_sth->execute( $m->{'marker_id'} );
                my $marker = $mk_sth->fetchrow_hashref;

                $m->{'marker_type'}     =
                    $marker_type_by_id{ $marker->{'marker_type_id'} };
                $m->{'species'}         =
                    $species_by_id{ $marker->{'source_species_id'} };
                $m->{'map_acc'}         = $map_acc;
                $m->{'map_name'}        = $map->map_name;
                $m->{'map_start'}       = $map->start;
                $m->{'map_stop'}        = $map->end;
                $m->{'feature_name'}    =~ s/'/\\'/g;
                $m->{'feature_aliases'} = join( $COMMA, 
                    map { s/'/\\'/g; $_ }
                    map { $_->{'marker_name'} }
                    grep { 
                        $_->{'marker_synonym_id'} != 
                        $m->{'display_synonym_id'} 
                    }
                    @{ $db->selectall_arrayref( 
                        $ALIAS_SQL, { Columns => {} }, ( $m->{'marker_id'} )
                    ) } 
                );

                my $marker_type = lc $m->{'marker_type'};

                if ( %take_marker_type && !$take_marker_type{$marker_type} ) {
                    next MAPPING;
                }

                if ( $marker_type eq 'undefined' ) {
                    $m->{'feature_type_acc'} = 'marker';
                }
                else {
                    ( $m->{'feature_type_acc'} = lc $m->{'marker_type'} ) =~
                            s/\s+/-/g;
                }

                print $out_fh join( $TAB, 
                    map { defined $m->{ $_ } ? $m->{ $_ } : '' } @OUT_FIELDS 
                ), $NL;

                $num_mappings++;
            }

            $mapping_sth->finish;
        }

        close $out_fh;
        push @out_files, [ $out_path, $ms_acc ];

        ( my $map_type_acc = lc $MapSet->map_type->map_type ) =~ s/\s+/-/g;
        my $ms_xml = {
            map_set_short_name => $MapSet->map_set_short_name,
            map_set_name       => $MapSet->map_set_name,
            map_set_acc        => $ms_acc,
            map_type           => $MapSet->map_type->map_type,
            map_type_acc       => $map_type_acc,
            species_id         => $MapSet->species->id,
            map_units          => $MapSet->distance_unit,
            published_on       => $MapSet->published_on,
            map                => \@map_data,
            is_relational_map  => $map_type_acc eq 'physical' ? 1 : 0,
        };

#        if ( $ms_acc eq $OMAP_TIGR ) {
#            for my $fld ( qw[ map_set_short_name map_set_name ] ) {
#                $ms_xml->{ $fld } = 'OMAP ' . $ms_xml->{ $fld };
#            }
#        }

        if ( my $desc = $MapSet->description ) {
            $desc =~ s/\r?\n/\n/g;
            $desc =~ s/[^[:ascii:]]//g;
            push @{ $ms_xml->{'attribute'} }, {
                object_type     => 'map_set',
                object_id       => $MapSet->id,
                attribute_name  => 'Description',
                attribute_value => $desc,
            };
        }

        push @{ $ms_xml->{'xref'} }, {
            table_name => 'map_set',
            object_id  => $MapSet->id,
            xref_name  => 'Map Set Details',
            xref_url   => $MAP_SET_INFO_URL . $MapSet->cmap_map_set_accession
        };

        push @{ $xml_data{'cmap_map_set'} }, $ms_xml;

        my $species_acc = $MapSet->species->cmap_species_accession || '';
        if ( !$species_acc ) {
            print STDERR "Warning: No CMap species accession for ",
                $MapSet->species->species, ".\n",
                "This may cause duplication in CMap!\n";
        }

        $xml_species{ $MapSet->species->id } = {
            species_id          => $MapSet->species->id,
            species_acc         => $species_acc,
            species_common_name => 
                $MapSet->species->common_name || $MapSet->species->species,
            species_full_name   => $MapSet->species->species,
        };
    }


    # Create the shell file with the args to "cmap_admin.pl" to import data
    my $cmd_file = catfile( $out_dir, 'cmap_import_commands.sh' );
    open my $cmd_fh, '>', $cmd_file or die "Can't write $cmd_file: $!\n";

    if ( %xml_species ) {
        $xml_data{'cmap_species'} = [ values %xml_species ];
    }

    if ( %xml_data ) {
        $logger->info('Writing XML');
        my $xml = join(
            $NL,
            "<?xml version='1.0'?>",
            XMLout(
                \%xml_data,
                RootName      => 'cmap_export',
                NoAttr        => 1,
                SuppressEmpty => 1,
                XMLDecl       => 0,
            )
        );

        my $xml_file = catfile( $out_dir, 'map_set_data.xml');
        open my $xml_fh, '>',  $xml_file or die "Can't write $xml_file: $!\n";
        print $xml_fh $xml;
        close $xml_fh;
        push @out_files, [ $xml_file, '' ];

        print $cmd_fh "$CMAP_ADMIN --allow_update -d $cmap_ds ",
            "-a import_object_data $xml_file\n";
    }

    for my $file ( @out_files ) {
        my ( $file_name, $ms_acc ) = @$file;

        if ( $file_name =~ /\.tab$/ ) {
            print $cmd_fh "$CMAP_ADMIN -d $cmap_ds -a import_tab_data ",
                "--map_set_acc $ms_acc $allow_update $file_name\n";
        }
    }

    for my $corr_filename ( @correspondence_files ) {
        push @out_files, [ $corr_filename, '' ];

        # Add the correspondence file to the command file
        my $ms_accs = join $COMMA, @ms_accs;
        print $cmd_fh "$CMAP_ADMIN -d $cmap_ds -a import_correspondences ",
            "--map_set_accs $ms_accs $corr_filename\n";

        print $cmd_fh 
            "$CMAP_ADMIN -d $cmap_ds -a reload_correspondence_matrix\n";
    }

    print $cmd_fh "$CMAP_ADMIN -d $cmap_ds -a purge_query_cache ",
        "--cache_level 1\n";

    close $cmd_fh;

    $logger->info('Finished export');
    print STDERR join("\n",
        "Done, processed $num_mappings mappings for $num_map_sets map sets.",
        "Wrote the following files:",
        ( map { $_->[0] } @out_files ), 
        '',
        'Now do this:',
        "sh $cmd_file",
        '',
    );
}

# ----------------------------------------------------
sub export_correspondences {
    my %args        = @_;
    my $map_set_id  = $args{'map_set_id'};
    my $map_set_acc = $args{'map_set_acc'};
    my $inter_map   = $args{'inter_map'} || 0;
    my $intra_map   = $args{'intra_map'} || 0;
    my $file_name   = $args{'file_name'};
    my $db_select   = $args{'db_select'};
    my $db          = $args{'db'};
    my $logger      = $args{'logger'};

    open my $fh, '>', $file_name or die "Can't write $file_name: $!\n";
    print $fh join( $TAB, qw[feature_acc1 feature_acc2 evidence] ), $NL;

    $logger->info(
        "Exporting correspondences for map set $map_set_acc ($map_set_id) "
        . "to '$file_name'."
    );

    print STDERR "Writing correspondence file '$file_name'\n";

    my @maps = Gramene::CDBI::Markers::Map->search( map_set_id => $map_set_id );

    my %seen;
    for my $map ( @maps ) {
        next if !$map->cmap_map_accession;

        $logger->info(
            'Exporting correspondences for map ' . $map->map_name 
            . ' (' . $map->id . ')'
        );
        my $marker_sth = $db_select->prepare(
            q[
                select m.marker_id, m.cmap_feature_accession
                from   mapping m, analysis a
                where  m.map_id=?
                and    m.cmap_feature_accession is not null
                and    m.analysis_id=a.analysis_id
                and    a.ok_to_release=1
            ]
        );
        $marker_sth->execute( $map->id );

        MARKER:
        while ( my ($marker_id, $feature_acc) = $marker_sth->fetchrow_array ) {
            next if $seen{ $marker_id }++;
            next if !$feature_acc;

#            if ( $map_set_acc eq $TIGR ) {
#                next MARKER if $feature_acc =~ /^OMAP/;
#            }

            my $mappings = $db->selectall_arrayref(
                q[
                    select map.map_set_id,
                           map.map_id,
                           mapping.cmap_feature_accession as feature_acc
                    from   mapping, map, analysis a
                    where  mapping.marker_id=?
                    and    mapping.cmap_feature_accession is not null
                    and    mapping.analysis_id=a.analysis_id
                    and    a.ok_to_release=1
                    and    mapping.map_id=map.map_id
                ],
                { Columns => {} },
                ( $marker_id )
            );

            my @pairs = make_pairs( scalar @$mappings );

            PAIR:
            for my $pair ( @pairs ) {
                my $f1 = $mappings->[ $pair->[0] ];
                my $f2 = $mappings->[ $pair->[1] ];

                next unless $f1->{'feature_acc'} && $f2->{'feature_acc'};
                next if     $f1->{'feature_acc'} eq $f2->{'feature_acc'};

                next unless 
                       $f1->{'map_set_id'} == $map_set_id
                    || $f2->{'map_set_id'} == $map_set_id;

#                if ( $map_set_acc eq $OMAP_TIGR ) {
#                    for ( $f1, $f2 ) {
#                        if ( $_->{'map_set_id'} == $map_set_id ) {
#                            if ( $_->{'feature_acc'} !~ /^OMAP/ ) {
#                                $_->{'feature_acc'} 
#                                    = 'OMAP' .  $_->{'feature_acc'};
#                            }
#                        }
#                    }
#                }

                if ( $f1->{'map_id'} == $f2->{'map_id'} ) {
                    next PAIR unless $intra_map;
                }

                if ( $f1->{'map_set_id'} == $f2->{'map_set_id'} ) {
                    next PAIR unless $inter_map;
                }

                print $fh join( $TAB,
                    sort( $f1->{'feature_acc'}, $f2->{'feature_acc'} ),
                    $MARKER_ID_EVIDENCE
                ), $NL;
            }

            my $corr_sql = qq[
                select ac.analytical_correspondence_id,
                       mapping.cmap_feature_accession as feature_acc,
                       map.map_id,
                       map.map_set_id,
                       a1.analysis_name
                from   analytical_correspondence ac,
                       analysis a1,
                       analysis a2,
                       mapping, 
                       map
                where  ac.from_marker_id=?
                and    ac.analysis_id=a1.analysis_id
                and    a1.ok_to_release=1
                and    ac.to_marker_id=mapping.marker_id
                and    mapping.cmap_feature_accession is not null
                and    mapping.analysis_id=a2.analysis_id
                and    a2.ok_to_release=1
                and    mapping.map_id=map.map_id
            ];

            my $analytical_correspondences = $db->selectall_arrayref(
                $corr_sql, { Columns => {} }, ( $marker_id )
            );

            if ( @$analytical_correspondences ) {
                MAPPING:
                for my $mapping ( @$mappings ) {
                    next MAPPING unless $mapping->{'map_set_id'} == $map_set_id;
                    next MAPPING unless $mapping->{'feature_acc'};

                    ANALYTICAL_CORR:
                    for my $c ( @$analytical_correspondences ) {
                        my $to_feature_acc = $c->{'feature_acc'} or next;

                        if ( $c->{'map_id'} == $mapping->{'map_id'} ) {
                            next ANALYTICAL_CORR if !$intra_map;
                        }

                        if ( $c->{'map_set_id'} == $mapping->{'map_set_id'} ) {
                            next ANALYTICAL_CORR if !$inter_map;
                        }

                        (my $evidence = lc $c->{'analysis_name'}) =~ s/\s+/_/g;

                        print $fh join( $TAB, 
                            sort( $mapping->{'feature_acc'}, $to_feature_acc ), 
                            $evidence
                        ), $NL;
                    }
                }
            }
        }

        $marker_sth->finish;
    }

    close $fh;

    return 1;
}

# ----------------------------------------------------
sub make_pairs {
    my $limit = shift || 0;

    my @pairs = ();
    for my $i ( 0 .. $limit ) {
        for my $j ( $i .. $limit ) {
            next if $i == $j;
            push @pairs, [ $i, $j ];
        }
    }

    return @pairs;
}

__END__

# ----------------------------------------------------

=pod

=head1 NAME

Gramene::Marker::Export - export mappings to CMap

=head1 SYNOPSIS

  Gramene::Marker::Export::export_to_cmap(;
    out_dir              => '/tmp/foo',
    cmap_data_source     => 'Build21',
    allow_update         => 1,
    map_set_accessions   => [ 'foo', 'bar' ],
    mappings_only        => 0, # not implemented
    correspondences_only => 1, 
    log_level            => 'warn', # default 'info'
  );

=head1 DESCRIPTION

Exports marker db mappings to CMap.

=head1 SEE ALSO

Gramene::Marker::DB, Bio::GMOD::CMap.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
