package Gramene::Marker::Import::Tab;

# $Id: Tab.pm,v 1.2 2007/05/23 01:31:48 kclark Exp $

=head1 NAME

Gramene::Marker::Import::Tab - import tab-delimited marker data into Gramene

=head1 SYNOPSIS

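  use Gramene::Marker::Import::Tab;

  my $importer = Gramene::Marker::Import::Tab->new;
  $importer->import_data(
      files    => [ 'markers.tab' ],
      analysis => 'My Analysis',
      verbose  => 1,
  );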

=head1 DESCRIPTION

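Imports marker records from tab-delimited text files into the Gramene
markers database by way of Gramene::Marker::DB.  The first line of each
file is read as a header row; header names are lower-cased and runs of
whitespace in them are replaced with underscores before they are used as
field names.  Every record must supply a marker name, a marker type and a
species; records missing any of these are reported with a warning and
skipped.  The entry point is the C<import_data> method.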

=head1 SEE ALSO

perl.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2007 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut

# ----------------------------------------------------

use strict;
use Carp qw( croak );
use Gramene::Marker::DB;
use Readonly;
use Text::ParseWords qw( parse_line );
use Text::RecordParser::Tab;

use base qw( Class::Base );

# Small string constants shared by the subs in this file.  $COMMA is the
# delimiter used to split the synonyms column; $COMMA_SPACE joins the list
# of invalid column names in the error raised by check_input_columns.
Readonly my $EMPTY_STR   => q{};
Readonly my $COMMA       => q{,};
Readonly my $COMMA_SPACE => q{, };

# Maps an input column name to the xref type name that import_data passes
# to find_or_create_xref_type when that column holds a value.
Readonly my %XREF_TYPES  => (
    genbank   => 'GenBank',
    reference => 'Gramene Literature',
);
# Module version, derived from the CVS "Revision" keyword (1.2 => "1.02").
# This has to be a package variable: a lexical ("my") $VERSION is invisible
# to UNIVERSAL::VERSION, to "use Module VERSION" checks and to the toolchain
# code that scans modules for their version.
our $VERSION = sprintf "%d.%02d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;

# import_data
#
# Reads one or more tab-delimited marker files and loads every usable
# record into the markers database through Gramene::Marker::DB.
#
# Named arguments:
#   files    - array ref of file paths (required; croaks when empty)
#   analysis - name, or numeric id, of the analysis to use for records
#              that carry neither "analysis_id" nor "analysis_name";
#              the analysis is created when it cannot be found
#   library  - same as "analysis", but for the library
#   verbose  - print progress to STDOUT (defaults to 1)
#   search_only_primary_name
#            - passed through unchanged to find_or_create_marker
#
# Columns read from each record: marker_name, marker_type and species (or
# marker_species) are required.  Optional: germplasm / germplasm_name,
# description, analysis_id, analysis_name, library_id, library_name,
# marker_id, marker_synonyms / synonyms / marker_aliases, genbank and
# reference.  The complete record is also handed to set_marker_details.
#
# Records lacking a required field, or for which no marker could be found
# or created, are reported with "warn", counted as errors and skipped.
# Database errors and malformed ids are fatal.
#
# Returns 1.
sub import_data {
    my ( $self, %args ) = @_;
    my @files    = @{ $args{'files'} || [] } or croak( 'No marker data file' );
    my $analysis = $args{'analysis'} || '';
    my $library  = $args{'library'}  || '';
    my $verbose  = defined $args{'verbose'} ? $args{'verbose'} : 1;
    my $search_only_primary_name = $args{'search_only_primary_name'} || '';

    my $mdb = Gramene::Marker::DB->new or die Gramene::Marker::DB->error;

    my ( $no_processed, $no_files, $no_errors ) = ( 0, 0, 0 );

    # On Ctrl-C, report how far the import got and exit.  The handler is
    # localized so that it is removed again when this sub returns instead
    # of staying installed (with stale counters) for the rest of the
    # program.
    local $SIG{'INT'} = sub {
        printf
            "Halted.  Processed %s record%s in %s file%s with %s error%s.\n",
            $no_processed, ( $no_processed == 1 ) ? $EMPTY_STR : 's',
            $no_files    , ( $no_files     == 1 ) ? $EMPTY_STR : 's',
            $no_errors   , ( $no_errors    == 1 ) ? $EMPTY_STR : 's',
        ;
        exit 0;
    };

    # Per-run caches so each distinct name is looked up only once.
    my (
        %species_id,   %germplasm_id, %marker_type_id,
        %xref_type_id, %analysis_id,  %library_id
    );

    # Resolve the caller-supplied default analysis: by name first, then
    # (when the value is all digits) by primary key, else create it.
    my $default_analysis_id = 0;
    if ( $analysis ) {
        my ($Analysis) = Gramene::CDBI::Markers::Analysis->search(
            analysis_name => $analysis,
        );

        if ( !$Analysis && $analysis =~ /^\d+$/ ) {
            ($Analysis) = Gramene::CDBI::Markers::Analysis->retrieve($analysis);
        }

        if ( !$Analysis ) {
            $Analysis = Gramene::CDBI::Markers::Analysis->insert({
                analysis_name => $analysis
            });
        }

        if ( $Analysis ) {
            $default_analysis_id = $Analysis->id;
        }
    }

    # Same resolution order for the caller-supplied default library.
    my $default_library_id = 0;
    if ( $library ) {
        my ($Library) = Gramene::CDBI::Markers::Library->search(
            library_name => $library,
        );

        if ( !$Library && $library =~ /^\d+$/ ) {
            ($Library) = Gramene::CDBI::Markers::Library->retrieve($library);
        }

        if ( !$Library ) {
            $Library = Gramene::CDBI::Markers::Library->insert({
                library_name => $library
            });
        }

        if ( $Library ) {
            $default_library_id = $Library->id;
        }
    }

    for my $file ( @files ) {
        $no_files++;

        print "Processing file '$file'\n" if $verbose;
        my $p = Text::RecordParser->new(
            filename        => $file,
            field_separator => "\t",

            # Normalize header names ("Marker Name" => "marker_name").
            # A lexical is used so the global $_ is left untouched.
            header_filter   => sub {
                my $header = shift;
                $header =~ s/\s+/_/g;
                return lc $header;
            },
        );
        $p->bind_header;

        my $columns_checked = 0;

        # Counts data records, so the header row is not included.
        my $line_no = 0;

        RECORD:
        while ( my $rec = $p->fetchrow_hashref ) {
            # The column check needs a marker type, so it is run once per
            # file using the type found in the first record.
            if ( !$columns_checked ) {
                check_input_columns(
                    file        => $file,
                    mdb         => $mdb,
                    marker_type => $rec->{'marker_type'},
                    parser      => $p,
                );
                $columns_checked = 1;
            }

            $line_no++;

            my $name = $rec->{'marker_name'} or do {
                warn "Line $line_no: no marker name\n";
                $no_errors++;
                next RECORD;
            };

            my $type = $rec->{'marker_type'} or do {
                warn "Line $line_no: no marker type\n";
                $no_errors++;
                next RECORD;
            };

            my $species = $rec->{'species'} || $rec->{'marker_species'} || '';
            if ( !$species ) {
                warn "Line $line_no: no species\n";
                $no_errors++;
                next RECORD;
            }

            my $germplasm   = $rec->{'germplasm'}
                              || $rec->{'germplasm_name'}
                              || 'UNKNOWN';
            my $description = $rec->{'description'} || $EMPTY_STR;
            $description    =~ s/^"|"$//g; # kill quotes

            print "$line_no: $species $type '$name'\n" if $verbose;

            my $species_id = $species_id{ $species };
            if ( !$species_id ) {
                $species_id = $mdb->find_or_create_species( $species )
                              or die $mdb->error;
                $species_id{ $species } = $species_id;
            }

            my $marker_type_id = $marker_type_id{ $type };
            if ( !$marker_type_id ) {
                $marker_type_id = $mdb->find_or_create_marker_type( $type )
                                  or die $mdb->error;
                $marker_type_id{ $type } = $marker_type_id;
            }

            # Germplasm is looked up per species, so the cache must be
            # keyed by species as well as by name.  Otherwise the id
            # cached for one species (notably for the 'UNKNOWN' default)
            # would be reused for every other species in the run.
            my $germplasm_id = $germplasm_id{ $species_id }{ $germplasm };
            if ( $germplasm && !$germplasm_id ) {
                $germplasm_id = $mdb->find_or_create_germplasm(
                    $germplasm, $species_id
                ) or die $mdb->error;
                $germplasm_id{ $species_id }{ $germplasm } = $germplasm_id;
            }

            # Analysis precedence: explicit id in the record, then name
            # in the record, then the caller's default, then 'UNKNOWN'.
            my $analysis_id   = $rec->{'analysis_id'} || 0;
            my $analysis_name
                = $rec->{'analysis_name'} ? $rec->{'analysis_name'}
                : $default_analysis_id    ? $EMPTY_STR
                :                           'UNKNOWN';

            if ( $analysis_id ) {
                if ( $analysis_id !~ /^\d+$/ ) {
                    die "Analysis id ($analysis_id) not an integer value\n";
                }
            }
            elsif ( $analysis_name ) {
                if ( !$analysis_id{ $analysis_name } ) {
                    my $Analysis
                        = Gramene::CDBI::Markers::Analysis->find_or_create({
                            analysis_name => $analysis_name
                        });

                    $analysis_id{ $analysis_name } = $Analysis->id;
                }

                $analysis_id = $analysis_id{ $analysis_name };
            }

            $analysis_id ||= $default_analysis_id;

            # Library follows the same precedence as analysis.  The name
            # only falls back to 'UNKNOWN' when the caller gave no default
            # library; an unconditional 'UNKNOWN' would always win and the
            # "library" argument would never take effect.
            my $library_id   = $rec->{'library_id'} || 0;
            my $library_name
                = $rec->{'library_name'} ? $rec->{'library_name'}
                : $default_library_id    ? $EMPTY_STR
                :                          'UNKNOWN';

            if ( $library_id ) {
                if ( $library_id !~ /^\d+$/ ) {
                    die "Library id ($library_id) not an integer value\n";
                }
            }
            elsif ( $library_name ) {
                if ( !$library_id{ $library_name } ) {
                    my $Library
                        = Gramene::CDBI::Markers::Library->find_or_create({
                            library_name => $library_name
                        });

                    $library_id{ $library_name } = $Library->id;
                }

                $library_id = $library_id{ $library_name };
            }

            $library_id ||= $default_library_id;

            my $marker_id = $rec->{'marker_id'} || 0;

            if ( $marker_id ) {
                # An explicit marker id must refer to an existing marker.
                $mdb->view_marker( marker_id => $marker_id )
                    or die "Bad marker id '$marker_id'\n";
            }
            else {
                $marker_id = $mdb->find_or_create_marker(
                    search_only_primary_name => $search_only_primary_name,
                    # The name is passed wrapped in double quotes.
                    # NOTE(review): presumably so the lookup treats it
                    # as an exact phrase -- confirm against
                    # Gramene::Marker::DB.
                    marker_name              => qq["$name"],
                    marker_type_id           => $marker_type_id,
                    species_id               => $species_id,
                    analysis_id              => $analysis_id,
                    library_id               => $library_id,
                    synonyms                 => [
                        parse_line( $COMMA, 1,
                            $rec->{'marker_synonyms'}
                            || $rec->{'synonyms'}
                            || $rec->{'marker_aliases'}
                        )
                    ]
                );
            }

            if ( !$marker_id ) {
                # Count the failure so the interrupt summary is accurate.
                warn $mdb->error;
                $no_errors++;
                next RECORD;
            }

            # The explicit keys come after %$rec so they override any
            # same-named columns from the input record.
            $mdb->set_marker_details(
                %$rec,
                marker_id      => $marker_id,
                marker_type_id => $marker_type_id,
                germplasm_id   => $germplasm_id,
                analysis_id    => $analysis_id,
                description    => $description,
            ) or die $mdb->error;

            # Attach an xref for each xref column that holds a value.
            for my $xt ( qw[ genbank reference ] ) {
                if ( my $xref_value = $rec->{ $xt } ) {
                    my $xref_type = $XREF_TYPES{ $xt } or
                        die "Bad xref type ($xt)";

                    my $xref_type_id = $xref_type_id{ $xref_type };
                    if ( !$xref_type_id ) {
                        $xref_type_id = $mdb->find_or_create_xref_type(
                            xref_type => $xref_type
                        ) or die $mdb->error;
                        $xref_type_id{ $xref_type } = $xref_type_id;
                    }

                    my $xref_id = $mdb->find_or_create_xref(
                        table_name   => 'marker',
                        record_id    => $marker_id,
                        xref_type_id => $xref_type_id,
                        xref_value   => $xref_value,
                    );
                }
            }

            $no_processed++;
        }
    }

    print "Processed '$no_files' files, '$no_processed' markers.\n" if $verbose;

    return 1;
}

# ----------------------------------------------------
# check_input_columns
#
# Verifies that every column bound by the parser is one this importer
# understands for the given marker type, and dies with a list of the
# offending columns otherwise.
#
# Named arguments:
#   marker_type - marker type name (required; dies when missing)
#   mdb         - object providing marker_type_to_table_name and error
#   parser      - object providing field_list (the bound header names)
#   file        - file name, used only in the error message
#
# A column is valid when it is either one of the columns returned by
# marker_type_to_table_name for the marker type or one of the generic
# columns that import_data handles itself.
#
# Returns nothing on success.  When marker_type_to_table_name yields
# nothing, the mdb error is returned instead of being raised.
# NOTE(review): import_data calls this in void context, so that failure is
# currently ignored -- confirm whether it should be fatal.
sub check_input_columns {
    my %args        = @_;
    my $marker_type = $args{'marker_type'} or die 'No marker type';
    my $mdb         = $args{'mdb'};
    my $p           = $args{'parser'};
    my $file        = $args{'file'};

    # Columns consumed directly by import_data rather than stored in the
    # marker-type details table.  Every field name import_data reads from a
    # record must be listed here, otherwise files using it are rejected
    # before import_data ever sees the value.
    my @non_details_fields = qw[
        marker_id
        marker_name
        marker_synonyms
        synonyms
        marker_aliases
        marker_type
        species
        marker_species
        germplasm
        germplasm_name
        description
        genbank
        reference
        seq
        analysis_name
        analysis_id
        library_name
        library_id
    ];

    # The first element returned is the details table name, which is not
    # needed here; the remainder are that table's column names.
    my ( undef, @valid_columns ) = $mdb->marker_type_to_table_name(
        $marker_type
    ) or return $mdb->error;
    my %valid_columns = map { $_, 1 } @valid_columns, @non_details_fields;

    my @invalid_columns = grep { !$valid_columns{ $_ } } $p->field_list;

    if ( @invalid_columns ) {
        die join "\n",
            qq[File '$file'],
            qq[contains invalid columns for marker type '$marker_type':],
            join( $COMMA_SPACE, @invalid_columns ),
            $EMPTY_STR
        ;
    }

    return;
}

1;
