#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: load-mappings.pl,v 1.22 2007/05/11 15:31:15 kclark Exp $

use strict;
use warnings;
use DateTime;
use English qw( -no_match_vars );
use File::Basename;
use Getopt::Long;
use Gramene::CDBI::Markers;
use Gramene::Marker::DB;
use Gramene::Utils qw( get_logger table_name_to_gramene_cdbi_class );
use IO::Prompt;
use List::MoreUtils qw( uniq );
use List::Util qw( max );
use Pod::Usage;
use Readonly;
use Text::RecordParser::Tab;
use Time::ParseDate;

# File-name suffixes removed from an input file's name when deriving the
# default analysis name.
Readonly my @FILE_SUFFIXES => qw( .dat .txt .tab );

# "major.minor" version string built from the CVS revision keyword.
Readonly my $VERSION => sprintf(
    '%d.%02d', q$Revision: 1.22 $ =~ /(\d+)\.(\d+)/
);

# Optional mapping columns, each with the alternate header names under
# which the same value may appear in an input file.
Readonly my %OPTIONAL_FIELDS => (
    cigar_line       => [],
    score            => [],
    evalue           => [],
    percent_identity => ['perc_id'],
    remark           => ['comments'],
);

# Small string constants used throughout the script.
Readonly my $COMMA        => ',';
Readonly my $EMPTY_STRING => '';
Readonly my $NL           => "\n";
Readonly my $UNDEFINED    => 'UNDEFINED';
Readonly my $UNKNOWN      => 'UNKNOWN';

# Command-line args: options that carry a value start from these defaults.
my $analysis_name            = $EMPTY_STRING;
my $library_name             = $UNKNOWN;
my $map_set_name             = $EMPTY_STRING;
my $ms_acc                   = $EMPTY_STRING;
my $ms_map_type              = $EMPTY_STRING;
my $ms_species               = $EMPTY_STRING;

# Boolean switches.
my $ensembl                  = 0;    # accepted, but not referenced below
my $no_prompt                = 0;
my $search_only_primary_name = 0;

# Options with no default.
my $help;
my $man_page;
my $show_version;
my $analysis_id;
my $marker_id_required;

GetOptions(
    'analysis-id:i'      => \$analysis_id,
    'analysis-name:s'    => \$analysis_name,
    'ensembl'            => \$ensembl,
    'help'               => \$help,
    'library-name:s'     => \$library_name,
    'm|ms-acc=s'         => \$ms_acc,
    'man'                => \$man_page,
    'map-set-name:s'     => \$map_set_name,
    'map-type:s'         => \$ms_map_type,
    'marker-id-required' => \$marker_id_required,
    'no-prompt'          => \$no_prompt,
    'p|primary'          => \$search_only_primary_name,
    'species:s'          => \$ms_species,
    'version'            => \$show_version,
) or pod2usage( 2 );

# --man wins over --help: it asks for the full manual rather than the
# brief usage text.
if ( $man_page ) {
    pod2usage({ -exitval => 0, -verbose => 2 });
}
elsif ( $help ) {
    pod2usage({ -exitval => 0, -verbose => 1 });
}

if ( $show_version ) {
    my $banner = basename( $PROGRAM_NAME ) . " v$VERSION\n";
    print $banner;
    exit 0;
}

# Whatever is left on the command line is the list of input files.
my @files = @ARGV;
pod2usage('No input files') if !@files;

# NOTE(review): presumably emptied so that later interactive prompts do
# not read their answers from the input files -- confirm against IO::Prompt.
@ARGV = ();

# Resolve the map set to load into.  The accession given with -m/--ms-acc
# must match at most one map set; when it matches none, a new map set is
# created from --species, --map-type and --map-set-name, prompting for
# whichever of those was not supplied.
my $map_set_id;
if ( $ms_acc ) {
    my @map_sets = Gramene::CDBI::Markers::MapSet->search(
        cmap_map_set_accession => $ms_acc
    );

    if ( scalar @map_sets > 1 ) {
        die "More than one map set with that accession?!\n";
    }
    elsif ( scalar @map_sets == 1 ) {
        $map_set_id = $map_sets[0]->map_set_id;
    }
    else {
        # Only ask for permission to create when something still has to
        # be answered interactively; a fully specified map set is created
        # without a question.
        if ( !$ms_species || !$ms_map_type || !$map_set_name ) {
            my $create = prompt -yn,
                "Can't find CMap map set accession '$ms_acc'. Create? ";

            if ( !$create ) {
                die "Can't do anything.  Please try again.\n";
            }
        }

        my $species_id   = prompt_for('species', $ms_species);
        my $map_type_id  = prompt_for('map_type', $ms_map_type);
        $map_set_name  ||= prompt 'Map Set Name? ';

        if ( $species_id && $map_type_id && $map_set_name ) {
            my $map_set = Gramene::CDBI::Markers::MapSet->insert({
                species_id             => $species_id,
                map_type_id            => $map_type_id,
                map_set_name           => $map_set_name, 
                cmap_map_set_accession => $ms_acc,
            });
            $map_set_id = $map_set->id;
        }
        else {
            # Without this the script would carry on with an undefined
            # map set id and only fail later with an unhelpful message.
            die "Can't create map set '$ms_acc' without a species, ",
                "map type and map set name.\n";
        }
    }
}
else {
    pod2usage('No map set accession');
}

# Give the operator a last chance to back out, unless --no-prompt was given.
if ( !$no_prompt ) {
    my $proceed = prompt( 
        -yn, "OK to load mappings for the markers on map set '$ms_acc'? "
    );

    unless ( $proceed ) {
        print "Exiting.\n";
        exit 0;
    }
}

# Fetch the map set record itself; its species is needed for the library.
my @found_map_sets = Gramene::CDBI::Markers::MapSet->search(
    map_set_id => $map_set_id
);
die "Bad map set id ($map_set_id)" if !@found_map_sets;
my $MapSet = $found_map_sets[0];

# Default library for the mappings: the --library-name value (or
# "UNKNOWN") under the map set's species, created if it does not exist.
my $Library = Gramene::CDBI::Markers::Library->find_or_create(
    {
        species_id   => $MapSet->species_id,
        library_name => $library_name,
    }
);

# Tab-delimited record parser and the marker database handle used for
# every input file.
my $p   = Text::RecordParser::Tab->new;
my $mdb = Gramene::Marker::DB->new;

# Trim leading and trailing whitespace from every defined field value.
# A lexical is used instead of assigning to the global $_, which would
# overwrite whatever the caller currently has aliased to $_.
$p->field_filter(
    sub {
        my $value = shift;
        $value =~ s/^\s+|\s+$//g if defined $value;
        return $value;
    }
);

# Normalise header names: runs of whitespace become "_" and the result
# is lower-cased, so "Marker Name" is read as "marker_name".
$p->header_filter(
    sub {
        my $header = shift;
        $header =~ s/\s+/_/g;
        return lc $header;
    }
);

my ( $num_files, $num_mappings, $num_errors ) = ( 0, 0, 0 );

# Load each input file in turn.  Every data row identifies one marker
# (by id, or by name/synonyms) and one position on a map of the chosen
# map set; the row is stored as a new mapping or updates an existing one.
# Rows that cannot be resolved are reported via complain() and skipped.
FILE:
for my $file ( @files ) {
    print "Processing file '$file'\n";
    $p->filename( $file );
    $p->bind_header;

    # The file's base name, minus known suffixes and the first
    # "final_filtered_" occurrence, is the fallback analysis name.
    my $file_basename = basename( $file, @FILE_SUFFIXES );
    $file_basename    =~ s/final_filtered_//;

    # File-level analysis: an existing one when --analysis-id is given,
    # otherwise found or created by name.
    my $Analysis;
    if ( $analysis_id ) {
        $Analysis
            = Gramene::CDBI::Markers::Analysis->retrieve( $analysis_id );

        if ( !$Analysis ) {
            complain(
                "Analysis id '$analysis_id' not found; ",
                "mappings will not be linked to an analysis"
            );
        }
    }
    else {
        $Analysis = Gramene::CDBI::Markers::Analysis->find_or_create(
            { analysis_name => $analysis_name || $file_basename }
        );
    }

    # The header is line 1, so the first data record is reported as line 2.
    my $line_num = 1;
    MAPPING:
    while ( my $rec = $p->fetchrow_hashref ) {
        $line_num++;

        # Each logical field may appear under several header names; the
        # first one holding a true value wins.
        my $marker_id = $rec->{'marker_id'} || 0;
        my $marker_type = $rec->{'marker_type'}
            || $rec->{'feature_type'}
            || $rec->{'feature_type_acc'}
            || $EMPTY_STRING;
        my $marker_species = $rec->{'marker_species'}
            || $rec->{'feature_species'}
            || $EMPTY_STRING;
        my $marker_name = $rec->{'marker_name'}
            || $rec->{'feature_name'}
            || $rec->{'hit_name'}
            || $EMPTY_STRING;
        my @marker_synonyms = split(/$COMMA/, 
               $rec->{'marker_synonyms'}
            || $rec->{'synonyms'}
            || $rec->{'feature_aliases'}
            || $EMPTY_STRING
        );
        my $map_name = $rec->{'map_name'}
            || $rec->{'chr'}
            || $rec->{'chr_with_offset'}
            || $EMPTY_STRING;
        my $start = $rec->{'marker_start'}
            || $rec->{'feature_start'}
            || $rec->{'chr_start'}
            || $EMPTY_STRING;
        my $end = $rec->{'marker_end'}
            || $rec->{'marker_stop'}
            || $rec->{'feature_stop'}
            || $rec->{'chr_stop'}
            || $EMPTY_STRING;
        my $map_acc = $rec->{'cmap_map_accession'}
            || $rec->{'cmap_map_acc'}
            || $rec->{'map_acc'};
        my $cmap_feature_accession = $rec->{'cmap_feature_accession'}
            || $rec->{'cmap_feature_acc'}
            || $rec->{'feature_acc'};

        my $map_start     = $rec->{'map_start'};
        my $map_end       = $rec->{'map_end'}   || $rec->{'map_stop'};
        my $marker_start  = $rec->{'hit_start'} || $EMPTY_STRING;
        my $marker_end    = $rec->{'hit_stop'}  || $EMPTY_STRING;
        my $marker_strand = $rec->{'strand'}    || $EMPTY_STRING;

        # A per-record "analysis" column overrides the file-level analysis.
        my $ThisAnalysis;
        if ( my $rec_analysis_name = $rec->{'analysis'} ) {
            $ThisAnalysis = Gramene::CDBI::Markers::Analysis->find_or_create(
                { analysis_name => $rec_analysis_name } 
            );
        }
        else {
            $ThisAnalysis = $Analysis;
        }

        # Likewise a per-record library overrides the default library.
        my $ThisLibrary;
        if ( 
            my $rec_library_name = $rec->{'library'} || $rec->{'library_name'} 
        ) {
            $ThisLibrary = Gramene::CDBI::Markers::Library->find_or_create(
                { library_name => $rec_library_name } 
            );
        }
        else {
            $ThisLibrary = $Library;
        }

        if ( !$marker_id ) {
            if ( $marker_id_required ) {
                complain(
                    "No marker id (marker_id required), $file line $line_num"
                );
                $num_errors++;
                next MAPPING;
            }
            elsif( !$marker_name ) {
                complain( "No marker name or marker id, $file line $line_num" );
                $num_errors++;
                next MAPPING;
            }
        }

        # No marker id in the record: look the marker up by name and
        # synonyms, creating it when nothing matches.
        unless ( $marker_id ) {
            my %search_args = (
                species     => $marker_species,
                search_only_primary_name => $search_only_primary_name,
            );

            # The library restricts the search only when it is a real
            # one; this does not depend on the name being searched.
            if ( $ThisLibrary->library_name ne $UNKNOWN ) {
                $search_args{'library_id'} = $ThisLibrary->id;
            }

            # types can be tricky, so first check without and then with
            my @markers;
            MSEARCH:
            for my $sname ( map { $_ || () } $marker_name, @marker_synonyms ) {
                # Names containing whitespace are searched as quoted
                # phrases.
                if ( $sname =~ /\s+/ && $sname !~ /^".+"$/ ) {
                    $sname = qq["$sname"];
                }

                $search_args{'marker_name'} = $sname;

                # $type aliases $marker_type on the second pass, so an
                # "UNDEFINED" type in the file is blanked for the rest of
                # this record as well.
                for my $type ( $EMPTY_STRING, $marker_type ) {
                    $type = $EMPTY_STRING if uc $type eq $UNDEFINED;

                    if ( $type ) {
                        $search_args{'marker_type'} = $type;
                    }
                    else {
                        # Drop a type left over from the previous name so
                        # the untyped search really is untyped.
                        delete $search_args{'marker_type'};
                    }

                    @markers = $mdb->marker_search( %search_args );

                    last MSEARCH if scalar @markers <= 1;
                }
            }

            my $num_markers = scalar @markers;

            if ( $num_markers > 1 ) {
                complain(
                    "Too many markers ($num_markers) match $marker_name ", 
                    "$file line $line_num" );
                $num_errors++;
                next MAPPING;
            }
            elsif ( $num_markers == 0 ) {
                if ( $marker_name && $marker_type && $marker_species ) {
                    $marker_id = $mdb->create_marker(
                        marker_name => $marker_name,
                        marker_type => $marker_type,
                        species     => $marker_species,
                        library_id  => $ThisLibrary->id,
                        # There may be no analysis when --analysis-id did
                        # not resolve; omit the argument instead of
                        # calling a method on an undefined value.
                        (
                            $ThisAnalysis
                            ? ( analysis_id => $ThisAnalysis->id )
                            : ()
                        ),
                    );
                }
                else {
                    complain(
                        "$file, line $line_num: ",
                        "Can't create new marker without ",
                        "name, type, and species ($marker_name)"
                    );
                    $num_errors++;
                    next MAPPING;
                }
            }
            else {
                $marker_id = $markers[0]->{'marker_id'};
            }
        }

        if ( !$marker_id ) {
            # Reported on STDERR like every other skipped record.
            complain( "$file, line $line_num: can't figure out marker id" );
            $num_errors++;
            next MAPPING;
        }

        my $marker = Gramene::CDBI::Markers::Marker->retrieve( $marker_id );
        unless ( $marker ) {
            complain(
                "$file, line $line_num: Marker id '$marker_id' not found!" 
            );
            $num_errors++;
            next MAPPING;
        }

        # Report a species mismatch between file and database; when the
        # database only knows "UNKNOWN", adopt the file's species.
        if ( $marker_species ne $EMPTY_STRING ) {
            my $db_species = $marker->source_species->to_string;

            if ( $db_species ne $marker_species ) {
                complain(
                    sprintf(
                        '%s (%s) species "%s" not the same as file "%s"',
                        $marker->display_synonym->to_string, $marker->id,
                        $db_species, $marker_species
                    )
                );

                if ( $db_species eq 'UNKNOWN' ) {
                    $mdb->update_Marker(
                        marker_id => $marker_id,
                        species   => $marker_species
                    );
                }
            }
        }

        my $display_synonym = $mdb->add_synonyms_to_Marker(
            Marker   => $marker,
            synonyms => [ $marker_name, @marker_synonyms ],
        );

        if ( my $feature_name = $rec->{'feature_name'} ) {
            $display_synonym = $mdb->add_synonyms_to_Marker(
                Marker   => $marker,
                synonyms => [ $feature_name ],
            );
        }

        # Copy any columns of the marker type's details table that are
        # present in the record into that table.
        $marker_type ||= $marker->marker_type->marker_type;
        if ( 
            my $details_table 
                = $mdb->marker_type_to_table_name( $marker_type )
        ) {
            my $details_class 
                = table_name_to_gramene_cdbi_class( 'Markers', $details_table );
            my $pk_name = $details_class->columns('Primary');
            my @columns = $details_class->columns('Ordered');

            my %set;
            for my $column ( @columns ) {
                next if $column eq $pk_name;
                my $value = $rec->{ $column };
                next unless defined $value && $value ne $EMPTY_STRING;
                $set{ $column } = $value;
            }

            if ( %set ) {
                my $details = $details_class->find_or_create(
                    { marker_id => $marker->id }
                );

                for my $column ( keys %set ) {
                    $details->$column( $set{ $column } );
                }

                $details->update;
            }
        }

        my $map = Gramene::CDBI::Markers::Map->find_or_create(
            {
                map_set_id => $map_set_id,
                map_name   => $map_name,
            }
        );

        # Apply whichever map attributes the record supplies, then save
        # them with a single update.
        my $map_changed = 0;
        if ( $map_acc ) {
            $map->cmap_map_accession( $map_acc );
            $map_changed = 1;
        }

        if ( defined $map_start ) {
            $map->start( $map_start );
            $map_changed = 1;
        }

        if ( defined $map_end ) {
            $map->end( $map_end );
            $map_changed = 1;
        }

        $map->update if $map_changed;

        # With a CMap feature accession the mapping is keyed on that
        # accession; otherwise on marker, map, start and end.
        my $mapping;
        if ( $cmap_feature_accession ) {
            ($mapping) = Gramene::CDBI::Markers::Mapping->search(
                { cmap_feature_accession => $cmap_feature_accession }
            );

            if ( $mapping ) {
                $mapping->marker_id( $marker->id );
                $mapping->map_id( $map->id );
            }
            else {
                $mapping = Gramene::CDBI::Markers::Mapping->find_or_create(
                    { 
                        marker_id              => $marker->id,
                        map_id                 => $map->id,
                        cmap_feature_accession => $cmap_feature_accession,
                    }
                );
            }

            $mapping->display_synonym_id( $display_synonym->id );
            $mapping->start( $start );
            $mapping->end( $end );
            $mapping->marker_start( $marker_start );
            $mapping->marker_end( $marker_end );
            $mapping->marker_strand( $marker_strand );
            $mapping->update;
        }
        else {
            ($mapping) = Gramene::CDBI::Markers::Mapping->search(
                marker_id => $marker->id,
                map_id    => $map->id,
                start     => $start,
                end       => $end,
            );

            if ( $mapping ) {
                $mapping->display_synonym_id( $display_synonym->id );
                $mapping->marker_start( $marker_start );
                $mapping->marker_end( $marker_end );
                $mapping->marker_strand( $marker_strand );
                $mapping->update;
            }
            else {
                $mapping = Gramene::CDBI::Markers::Mapping->find_or_create(
                    {
                        marker_id          => $marker->id,
                        display_synonym_id => $display_synonym->id,
                        map_id             => $map->id,
                        start              => $start,
                        end                => $end,
                        marker_start       => $marker_start,
                        marker_end         => $marker_end,
                        marker_strand      => $marker_strand,
                    }
                );
            }
        }

        if ( $ThisAnalysis ) {
            $mapping->analysis_id( $ThisAnalysis->id );
            $mapping->update;
        }

        # Optional columns, under their own name or an alias, are stored
        # only when they differ (case-insensitively) from what the
        # mapping already holds.
        for my $field ( sort keys %OPTIONAL_FIELDS ) {
            my $aliases = $OPTIONAL_FIELDS{ $field };

            for my $fld ( $field, @{ $aliases || [] } ) {
                my $val = $rec->{ $fld };
                next unless defined $val && $val ne $EMPTY_STRING;

                # An unset column compares as the empty string rather
                # than raising an "uninitialized" warning.
                my $current = $mapping->$field;
                $current = $EMPTY_STRING unless defined $current;

                next if lc( $current ) eq lc( $val );
                $mapping->$field( $val );
                $mapping->update;
            }
        }

        printf "%s: %s %s %s on %s (%s-%s)\n", ++$num_mappings,
            $marker->source_species->species,
            $marker->marker_type->marker_type,
            $marker->display_synonym->marker_name, 
            $map_name, 
            $mapping->start, 
            $mapping->end
        ;
    }

    # Stretch each map's end so that it covers the highest mapping
    # coordinate now stored for it.
    my $map_set = Gramene::CDBI::Markers::MapSet->retrieve( $map_set_id );
    for my $map ( $map_set->maps ) {
        my $map_end = $map->end || 0;
        my ( $h1, $h2 ) = $mdb->db->selectrow_array(
            'select max(start), max(end) from mapping where map_id=?',
            {}, ( $map->id )
        );
        my $highest = max( grep { defined $_ } $h1, $h2 ) || 0;

        if ( !$map_end || $map_end < $highest ) {
            $map->end( $highest );
            $map->update;
        }
    }

    # Stamp the file-level analysis with today's date.
    # NOTE(review): DateTime->from_epoch uses UTC unless told otherwise,
    # so the date may differ from the local date -- confirm this is wanted.
    if ( $Analysis ) {
        my $epoch = parsedate('today');
        my $dt    = DateTime->from_epoch( epoch => $epoch );
        $Analysis->last_run( $dt->strftime('%Y-%m-%d') );
        $Analysis->update;
    }

    $num_files++;
}

print "Done, processed $num_files files, $num_mappings mappings ",
    "with $num_errors errors.\n"; 

# Write the given message fragments, back to back and followed by a
# newline, to STDERR.
sub complain {
    my @message = @_;
    return print {*STDERR} @message, "\n";
}

# Resolve a value to the id of exactly one database object, asking the
# user until that succeeds.
#
# Arguments:
#   $object_type - underscore-separated object name, e.g. "map_type";
#                  it names both the Gramene::CDBI::Markers class
#                  ("MapType") and the column that is searched
#   $value       - optional first value to try before prompting
#
# Returns the id of the single matching object, or nothing when no
# object type is given.  Entering "q" ends the program.
sub prompt_for {
    my ( $object_type, $value ) = @_;
    return if !$object_type;
    $value ||= $EMPTY_STRING;

    # "map_type" => "Gramene::CDBI::Markers::MapType"
    my @words = map { ucfirst $_ } split /_/, $object_type;
    my $class = 'Gramene::CDBI::Markers::' . join( $EMPTY_STRING, @words );

    while ( 1 ) {
        if ( !$value ) {
            $value = prompt "$object_type? ";
        }

        if ( $value eq 'q' ) {
            print "Quitting.\n";
            exit 0;
        }

        my @matches = $class->search( $object_type => $value );
        my $count   = scalar @matches;

        return $matches[0]->id if $count == 1;

        # None or several matches: report and ask again.
        print join($NL,
            "Found $count for '$value'",
            'Please be more specific ("q" to quit)',
            $EMPTY_STRING
        );
        $value = $EMPTY_STRING;
    }
}

__END__

# ----------------------------------------------------

=head1 NAME

load-mappings.pl - load mapping data into markers db

=head1 VERSION

This documentation refers to load-mappings.pl version $Revision: 1.22 $

=head1 SYNOPSIS

  load-mappings.pl [options] file1.dat [file2.dat ...]

Required Arguments:

  -m|--ms-acc   CMap map set accession, e.g., "fef2006a"

Options:

  --species         Species of the mapping data
  --map-type        Map set map type, for example, 
                      "Genetic", "QTL", "mbin", "sequence"
  --map-set-name    Map set name, for example,
                      "TIGR/IRGSP Assembly v4 (Jan 2006)"
  --no-prompt       Skip the "OK to load mappings" confirmation prompt
  -p|--primary      Sets "search_only_primary_name" for the marker
                      name search
  --ensembl         The input mapping files are in the format used for
                      loading into an Ensembl database; the analysis
                      name is parsed from the file name, and the
                      species and type are determined by consulting
                      a pre-defined hash (note: this version of the
                      script accepts the flag but does not act on it)
  --analysis-id     ID of an existing analysis to use for the mappings;
                      the program will not create any analysis
  --analysis-name   Analysis name to use for mappings, will be created
  --library-name    Library name to use for mappings, will be created;
                      assumes the same species as the map set; if not
                      defined, uses "UNKNOWN"
  --marker-id-required 
                    Every record must supply a "marker_id"; records
                      without one are reported as errors and skipped
                      instead of being searched for or created by name

  --help         Show brief help and exit
  --man          Show full documentation
  --version      Show version and exit

=head1 DESCRIPTION

This script loads marker mappings into the markers database.  You must
specify the CMap map set accession to load into.  If the map set
cannot be found, you will be prompted to create it.  You must supply
the species, map type, and map set name before continuing.

The files should be tab-delimited with the first line containing the
field names.  Ideally, the field names will be the same as what's in
the markers database, but many aliases are accepted, e.g., "perc_id"
for "percent_identity" (these aliases taken from BioPipe output).

Each line of the input files must identify the marker to which to link
the mapping.  Ideally, "marker_id" will do this, but some combination
of "marker_name," "marker_species" and "marker_type" may also be used.
The file name is used only as the default analysis name (when neither
--analysis-id nor --analysis-name is given); the %FILES_TO_SPECIES_TYPE
hash mentioned in earlier documentation is not defined in this script,
so the species and marker type are read only from the data.
The following fields identify the marker and its position:

=over 4

=item * marker_id

=item * marker_type

=item * marker_species

=item * marker_name (or "hit_name")

=item * map_name (or "chr" or "chr_with_offset")

=item * marker_start (or "chr_start")

=item * marker_end (or "chr_stop")

=back

The following are optional fields:

=over 4

=item * map_start, map_end (start and stop coordinates for the map)

=item * analysis

=item * marker_strand (or "strand")

=item * cigar_line

=item * score

=item * evalue

=item * percent_identity (or "perc_id")

=item * remark (or "comments")

=back

If no markers can be found for the available criteria, then a new
marker will be created.  If more than one marker matches, then an
error will be printed (to STDERR) and the mapping will be skipped.

When "marker_id" is not present, at least "marker_name" must be
present so that the marker can be searched for.  If it is not, then an
error will be printed and the mapping skipped.  The same will happen if a
"marker_id" is supplied that does not exist in the database.  The
other criteria (marker type and species) are used in various
combinations until just one marker can be found.  If a marker is found
without using the species in the file when one is present and then the
found marker does not have the same species, an error will be printed
that there is a mismatch.  If the marker db has an "UNKNOWN" species
for the marker and a different value is present in the file, then the
marker db will be updated to match the file.

For each record processed, a line will be printed to STDOUT and a
summary will be shown when the script has finished.
  
=head1 SEE ALSO

Gramene::Marker::DB, Gramene::CDBI::Markers.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
