#!/usr/local/bin/perl

# $Id: load-marker-correspondences.pl,v 1.6 2007/05/22 19:46:12 kclark Exp $

=head1 NAME

load-marker-correspondences.pl - loads marker correspondence data

=head1 SYNOPSIS

  load-marker-correspondences.pl [options] file1 [...]

Options:

  -a|--analysis Analysis name (or put in the file)
  -t|--type     Analytical correspondence type (or put in the file)

  -h|--help     Show brief help and exit
  -v|--version  Show version and exit

=head1 DESCRIPTION

Accepts a tab-delimited file with the following columns:

  from_marker_id      \
  to_marker_id        /

  from_marker_name    \
  from_marker_species  |
  from_marker_type     |
  to_marker_name       |
  to_marker_species    |
  to_marker_type      /

  analysis_name                  - optional if on command-line
  analytical_correspondence_type - optional if on command-line

Obviously, if providing "from/to_marker_id," nothing else is required to 
identify the markers.  However, if providing names, you may also choose to 
specify the species and type.  Even then the markers may not be completely 
unique, in which case an error will be generated and the correspondence 
will not be created.

=head1 SEE ALSO

Gramene::Marker::DB.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2005 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut

# ----------------------------------------------------

use strict;
use DateTime;
use File::Basename;
use Getopt::Long;
use Gramene::CDBI::Markers;
use Gramene::Marker::DB;
use Pod::Usage;
use Readonly;
use Text::RecordParser;
use Time::ParseDate;

Readonly my $VERSION => sprintf "%d.%02d", q$Revision: 1.6 $ =~ /(\d+)\.(\d+)/;
Readonly my $EMPTY_STR => q{};

my ( $no_processed, $no_files ) = ( 0, 0 );
$SIG{'INT'} = \&report;

my $analysis_name                  = $EMPTY_STR;
my $analytical_correspondence_type = $EMPTY_STR;
my ( $help, $show_version );
GetOptions(
    'a|analysis:s' => \$analysis_name,
    't|type:s'     => \$analytical_correspondence_type,
    'h|help'       => \$help,
    'v|version'    => \$show_version,
);
pod2usage(2) if $help;

if ( $show_version ) {
    my $prog = basename( $0 );
    print "$prog v$VERSION\n";
    exit(0);
}

my @files = @ARGV or pod2usage('No marker data file');
my $mdb   = Gramene::Marker::DB->new( admin => 1 ) 
            or die Gramene::Marker::DB->error;
my %valid_fields = map { $_ => 1 } qw[
    from_marker_id
    to_marker_id
    from_marker_name
    from_marker_species
    from_marker_type
    to_marker_name
    to_marker_species
    to_marker_type
    analysis_name
    analytical_correspondence_type
];

my %analysis_used; # keep track of all used
my $Analysis;
if ( $analysis_name ) {
    $Analysis = Gramene::CDBI::Markers::Analysis->find_or_create(
        { analysis_name => $analysis_name }
    );

    $analysis_used{ $analysis_name } = $Analysis;
}

my $AnalyticalCorrespondenceType;
if ( $analytical_correspondence_type ) {
    $AnalyticalCorrespondenceType
    = Gramene::CDBI::Markers::AnalyticalCorrespondenceType->find_or_create(
        { type => $analytical_correspondence_type }
    );
}

FILE:
for my $file ( @files ) {
    $no_files++;

    unless ( -e $file && -r _ && -s _ ) {
        print "Something about the file '$file' isn't right, skipping.\n";
        next;
    }

    print "Processing file '$file'\n";
    my $p = Text::RecordParser->new( 
        filename        => $file,
        field_separator => "\t",
        header_filter   => sub { $_ = shift; s/\s+/_/g; lc $_ },
    );
    $p->bind_header;

    my @bad_fields;
    for my $fld_name ( $p->field_list ) {
        push @bad_fields, $fld_name unless $valid_fields{ $fld_name };
    }

    if ( @bad_fields ) {
        print STDERR "File '$file' contains the following invalid fields:\n",
            join(', ', @bad_fields), "\n";
        next FILE;
    } 

    my $line_no = 0;
    LINE:
    while ( my $rec = $p->fetchrow_hashref ) {
        $line_no++;

        my ( $from_marker_id, $to_marker_id );
        for my $x ( 
            [ 'from', \$from_marker_id ],
            [ 'to'  , \$to_marker_id   ]
        ) {
            my $direction   = $x->[0];
            my $variable    = $x->[1];
            my $marker_id   = $rec->{ $direction . '_marker_id' }   || '';
            my $marker_name = $rec->{ $direction . '_marker_name' } || '';

            if ( 
                $marker_name 
                && $marker_name =~ /\s+/ 
                && $marker_name !~ /^".*"$/
            ) {
                $marker_name = qq["$marker_name"];
            }

            unless ( $marker_id || $marker_name ) {
                print STDERR 
                "Must provide at least marker name or id at line $line_no\n";
                next LINE;
            }

            my @markers = $mdb->marker_search(
                marker_id   => $marker_id,
                marker_name => $marker_name,
                marker_type => $rec->{ $direction . '_marker_type' }    || '',
                species     => $rec->{ $direction . '_marker_species' } || '',
            );

            my $no_markers = scalar @markers;
            if ( $no_markers == 1 ) {
                $$variable = $markers[0]->{'marker_id'};
            }
            elsif ( $no_markers > 1 ) {
                print STDERR 
                    "$no_markers match $direction marker at line $line_no\n";
            }
            else {
                print STDERR 
                    "Cannot find $direction marker at line $line_no\n";
            }
        }

        unless ( $from_marker_id && $to_marker_id ) {
            print STDERR 
                "Error line $line_no: Cannot find one or both markers!\n";
            next LINE;
        }

        my $analysis = $rec->{'analysis_name'} or do {
            print STDERR "No analysis provided at line $line_no\n";
            next LINE;
        };

        my $analysis_id;
        if ( my $analysis = $rec->{'analysis_name'} ) {
            my $RecAnalysis = Gramene::CDBI::Markers::Analysis->find_or_create(
                { analysis_name => $analysis } 
            );

            $analysis_id = $RecAnalysis->id;

            $analysis_used{ $analysis } ||= $RecAnalysis;
        }
        else {
            $analysis_id = $Analysis->id;
        }

        my $analytical_correspondenc_type_id;
        if ( my $type = $rec->{'analytical_correspondence_type'} ) {
            my $RecType 
            = Gramene::CDBI::Markers::AnalyticalCorrespondenceType->find_or_create(
                { type => $type } 
            );

            $analytical_correspondenc_type_id = $RecType->id;
        }
        else {
            $analytical_correspondenc_type_id 
                = $AnalyticalCorrespondenceType->id;
        }

        my $corr_id = $mdb->set_correspondence(
            from_marker_id                    => $from_marker_id,
            to_marker_id                      => $to_marker_id,
            analysis_id                       => $analysis_id,
            analytical_correspondence_type_id 
                => $analytical_correspondenc_type_id,
        ) or do {
            print STDERR "Error creating correspondence at line $line_no: ",
                $mdb->error;
            next LINE;
        };

        print "$line_no: Corr $from_marker_id => $to_marker_id ($corr_id)\n";
        $no_processed++;
    }
}

for my $Ana ( values %analysis_used ) {
    printf "Updating analysis '%s' last_run\n", $Ana->analysis_name;

    my $epoch = parsedate('today');
    my $dt    = DateTime->from_epoch( epoch => $epoch );
    $Ana->last_run( $dt->strftime('%Y-%m-%d') );
    $Ana->update;
}

report('Finished');

sub report { 
    my $action = shift || 'Finished';

    printf( 
        "%s: Processed %s record%s in %s file%s.\n",
        $action,
        $no_processed, ( $no_processed == 1 ) ? $EMPTY_STR : 's',
        $no_files    , ( $no_files     == 1 ) ? $EMPTY_STR : 's',
    ); 
    exit(0); 
};

