package Gramene::Marker::Import::GenBank;

# $Id: GenBank.pm,v 1.2 2007/06/05 19:25:54 kclark Exp $

=head1 NAME

Gramene::Marker::Import::GenBank - import GenBank data

=head1 SYNOPSIS

  use Gramene::Marker::Import::GenBank;

=head1 DESCRIPTION

Imports a GenBank SeqIO into the markers db.

=head1 SEE ALSO

Bio::SeqIO.

=head1 AUTHOR

William Spooner E<lt>whs@ebi.ac.ukE<gt>,
Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2007 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut

# ----------------------------------------------------

use strict;
use Carp qw( croak );
use Data::Dumper;
use Gramene::CDBI::Markers;
use Gramene::Marker::DB;
use Perl6::Say;
use Readonly;

use lib '/home/kclark/src/bioperl-1.5.2_102/blib/lib';
use Bio::SeqIO;
use Bio::DB::Taxonomy;
use Bio::Taxon;

use base qw( Class::Base );

#
# Sequences whose residue string is longer than this are skipped by
# import_data.
#
Readonly my $MAX_SEQ_LENGTH => 1_000_000;

#
# Taxa eligible for import.  Keys are compared against lower-cased
# scientific names (the family name from the taxonomy lookup and
# "lc $sp->genus"), so they must be lower-case scientific names.
# "poplar" is a common name and can never equal a genus string; the genus
# is "Populus".  The old key is kept so nothing that relied on it changes.
#
Readonly my %ACCEPTABLE_TAX => (
    family => { poaceae => 1 },
    genus  => { arabidopsis => 1, populus => 1, poplar => 1 },
);

#
# Package variable (not a lexical) so that
# Gramene::Marker::Import::GenBank->VERSION and "use Module VERSION" see it.
#
our $VERSION = sprintf "%d.%02d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;

# ----------------------------------------------------
sub import_data {
    #
    # Reads one or more GenBank flat files with Bio::SeqIO and, for every
    # sequence that passes the taxonomy and length filters, assembles a
    # marker hash (Species, species_id, seq_io, details, synonyms,
    # marker_type, correspondences).  As written, the marker is only dumped
    # to STDOUT; this routine does not itself store anything.
    #
    # Named arguments:
    #   files   - array ref of GenBank file names (required)
    #   verbose - true to print progress, marker dumps and the final
    #             summary (defaults to 1)
    #
    # Croaks with 'No marker data file' when no files are given.
    # Returns 1.
    #
    my ( $self, %args ) = @_;
    my $verbose = defined $args{'verbose'} ? $args{'verbose'} : 1;
    my @files   = @{ $args{'files'} || [] } or croak( 'No marker data file' );

    my $tax_db = Bio::DB::Taxonomy->new( -source => 'entrez' );
    my $mdb    = Gramene::Marker::DB->new;

    #
    # Per-taxid caches so each taxon is looked up only once per run.
    #
    my ( %family_by_tax_id, %species_by_tax_id );

    my $num_files             = 0;
    my $total_num_sequences   = 0;
    my $num_sequences_skipped = 0;

    for my $file ( @files ) {
        $num_files++;
        print "file $num_files = '$file'\n" if $verbose;
        my $seqio = Bio::SeqIO->new(
            -format => 'GenBank',
            -file   => $file
        );

        my $num_sequences = 0;
        SEQUENCE:
        while ( my $seq = $seqio->next_seq ) {
            $num_sequences++;

            my $sp     = $seq->species or next SEQUENCE;
            my $tax_id = $sp->ncbi_taxid;

            #
            # Resolve family and species name with a single taxonomy
            # lookup and remember both.  A taxon that cannot be retrieved
            # is not cached, so it is retried on its next occurrence.
            #
            if ( defined $tax_id && !exists $species_by_tax_id{ $tax_id } ) {
                if ( my $tax = $tax_db->get_taxon( -taxonid => $tax_id ) ) {
                    my $tree = Bio::Tree::Tree->new( -node => $tax );
                    my $ftax = $tree->find_node( -rank => 'family' );

                    $family_by_tax_id{ $tax_id }
                        = $ftax ? lc $ftax->scientific_name : undef;
                    $species_by_tax_id{ $tax_id } = $tax->scientific_name;
                }
            }

            my ( $family, $species_name );
            if ( defined $tax_id ) {
                $family       = $family_by_tax_id{ $tax_id };
                $species_name = $species_by_tax_id{ $tax_id };
            }

            my %marker;
            if (
                $family && (
                       exists $ACCEPTABLE_TAX{'family'}{ $family }
                    || exists $ACCEPTABLE_TAX{'genus'}{ lc $sp->genus }
                )
            ) {
                my ($Species) = Gramene::CDBI::Markers::Species->search(
                    species => $species_name,
                );

                if ( $Species ) {
                    $marker{'Species'}    = $Species;
                    $marker{'species_id'} = $Species->id;
                }
            }

            print "marker = ", Dumper(\%marker), "\n" if $verbose;

            my $sequence = $seq->seq;

            #
            # Skip anything without a known species (including records
            # whose taxon could not be resolved) or with an over-long
            # sequence.
            #
            if (
                !$marker{'species_id'}
                || length( $sequence ) > $MAX_SEQ_LENGTH
            ) {
                $num_sequences_skipped++;
                print "no species or seq too long\n" if $verbose;
                next SEQUENCE;
            }

            $marker{'seq_io'}      = $seq;
            $marker{'details'}     = get_attributes( $seq, $sp );
            $marker{'synonyms'}    = get_synonyms( \%marker );
            $marker{'marker_type'} =
                $marker{'details'}{'keyword'} =~ /^(GSS|EST)$/i
                    ? uc $1
                    : $marker{'details'}{'mol_type'} =~ /mrna/i
                        ? 'mRNA'
                        : 'other_nucleotide'
            ;

            $marker{'correspondences'} = get_correspondences( \%marker, $mdb );

            print "marker = ", Dumper(\%marker) if $verbose;

            # NOTE(review): stops reading the file at the first marker that
            # is not 'other_nucleotide'; looks like a debugging leftover --
            # confirm before removing.
            last if $marker{'marker_type'} ne 'other_nucleotide';
        }

        #
        # Add this file's count once, after the loop, so the total is the
        # number of sequences read rather than a sum of the running counter.
        #
        $total_num_sequences += $num_sequences;

        # NOTE(review): only the first file is ever processed because of
        # this unconditional exit; presumably a debugging leftover --
        # confirm before removing.
        last;
    }

    if ( $verbose ) {
        printf
            join("\n",
                'Done',
                'Files Processed    : %s',
                'Sequences Processed: %s',
                'Sequences Skipped  : %s',
                '',
            ),
            $num_files,
            $total_num_sequences,
            $num_sequences_skipped,
        ;
    }

    return 1;
}

# ----------------------------------------------------
sub get_attributes {
    #
    # Flattens a sequence's annotations, feature tags and two species
    # properties into one hash.
    #
    # Arguments:
    #   $seq     - object providing annotation() and get_all_SeqFeatures()
    #   $species - object providing sub_species() and organelle()
    #
    # Returns the hash in list context, a reference to it otherwise.
    # Dies with "Need an AnnotationCollection" if annotation() returns
    # something that is not a Bio::AnnotationCollectionI.
    #
    # Later values overwrite earlier ones that share a key.
    #
    my ( $seq, $species ) = @_;

    my %details;

    my $collection = $seq->annotation;
    if ( $collection ) {
        die( "Need an AnnotationCollection" )
            unless $collection->isa( 'Bio::AnnotationCollectionI' );

        for my $name ( $collection->get_all_annotation_keys() ) {
            ANNOTATION:
            for my $item ( $collection->get_Annotations( $name ) ) {
                if ( $item->isa( 'Bio::Annotation::SimpleValue' ) ) {
                    $details{ $name } = $item->value;
                    next ANNOTATION;
                }

                if ( $item->isa( 'Bio::Annotation::Comment' ) ) {
                    $details{ $name } = $item->text;
                    next ANNOTATION;
                }

                if ( !$item->isa( 'Bio::Annotation::Reference' ) ) {
                    # Unrecognised annotation class: keep the object itself
                    $details{ $name } = $item;
                    next ANNOTATION;
                }

                # References are stored under fixed ref_* keys, not $name
                $details{'ref_authors'}  = $item->authors;
                $details{'ref_title'}    = $item->title;
                $details{'ref_location'} = $item->location;
                $details{'ref_pubmed'}   = $item->pubmed;
                $details{'ref_medline'}  = $item->medline;

                # A four-digit number in parentheses is taken as the year
                my ( $year ) = $details{'ref_location'} =~ /\((\d{4,4})\)/;
                $details{'ref_year'} = $year if defined $year;
            }
        }
    }

    # One entry per feature tag; multiple values are joined with ' || '
    for my $feature ( $seq->get_all_SeqFeatures ) {
        for my $tag_name ( $feature->get_all_tags ) {
            my @values = $feature->get_tag_values( $tag_name );
            $details{ $tag_name } = join( ' || ', @values );
        }
    }

    my $sub_species = $species->sub_species;
    $details{'sub_species'} = $sub_species if $sub_species;

    my $organelle = $species->organelle;
    $details{'organelle'} = $organelle if $organelle;

    return %details if wantarray;
    return \%details;
}

# ----------------------------------------------------
sub get_correspondences {
    #
    # Works out which existing markers correspond to the marker being
    # imported.
    #
    # Arguments:
    #   $marker - hash ref as built by import_data; the keys read here are
    #             synonyms, marker_type, details, Species, seq_io (or the
    #             older _seq) and Analysis
    #   $mdb    - object providing search_marker_synonyms, retrieve_Marker
    #             and search_MarkerDetailsGss
    #
    # Returns a list (array ref in scalar context) of array refs whose
    # first element is the correspondence type: 'SYNONYMOUS', 'MATE_PAIR',
    # 'SHARED_SYNONYM' or 'CLONE_END'.  The other slots hold a marker id
    # or undef (CLONE_END puts the id in the last slot, the others in the
    # second).  SYNONYMOUS entries are placed first; if there are none, a
    # single undef is placed first instead.
    #
    # Croaks if either argument is missing.
    #
    my $marker = shift or croak('No marker');
    my $mdb    = shift or croak('No mdb');

    #
    # The marker and its synonyms are plain hashes, so use hash access
    # throughout (method calls on them would die).
    #
    my @synonyms = @{ $marker->{'synonyms'} || [] };
    my @syns     = map { $_->{'marker_name'} } @synonyms;
    my %syn_type = map {
        ( $_->{'marker_name'} => $_->{'synonym_type'} )
    } @synonyms;

    my $marker_type = uc( $marker->{'marker_type'} || '' );

    #
    # Synonym types to never synonymise, e.g., integer database IDs
    #
    my %ignore_syntype = ( TRACE_TI => 1, GENBANK_TI => 1 );

    #
    # BACend/GSS specific correspondences
    #
    my @correspondences;
    my %corr_marker_ids;    # marker_id => true once a correspondence exists
    if ( $marker_type =~ /^(BAC|GSS)/ ) {
        #
        # Explicit mate_pair via trace_id
        #
        if ( my $mate_ti = $marker->{'details'}{'mate_pair'} ) {
            for my $mrk (
                $mdb->search_marker_synonyms(
                    synonyms     => $mate_ti,
                    Species      => $marker->{'Species'},
                    marker_type  => $marker->{'marker_type'},
                    synonym_type => 'TRACE_TI'
                )
            ) {
                next if $corr_marker_ids{ $mrk->{'marker_id'} };

                # Only retrieve markers that are not already recorded
                my $mate_marker = $mdb->retrieve_Marker( $mrk->{'marker_id'} );
                $corr_marker_ids{ $mrk->{'marker_id'} } = $mrk;
                push @correspondences, [
                    # $CDBI_CORR_TYPES{'MATE_PAIR'},
                    'MATE_PAIR',
                    $mate_marker->id, undef
                ];
            }
        }

        #
        # Implicit mate_pair via template (same as clone, but indexed so fast)
        #
        my $search = 'search_MarkerDetailsGss';
        if ( my $template = $marker->{'details'}{'template'} ) {
            for my $md ( $mdb->$search( { template => $template } ) ) {
                my $mid = $md->id;
                $corr_marker_ids{ $mid } && next;
                $corr_marker_ids{ $mid } = {};
                push @correspondences, [
                    # $CDBI_CORR_TYPES{'MATE_PAIR'},
                    'MATE_PAIR', $mid, undef
                ];
            }
        }

        #
        # Clone correspondence; create new clone if not found in database
        #
        if ( my $clone = $marker->{'details'}{'clone'} ) {
            my $cdbi_clone;
            for my $mrk (
                $mdb->search_marker_synonyms(
                    synonyms    => $clone,
                    # MarkerType => $CDBI_MARKER_TYPES{'clone'},
                    marker_type => 'Clone',
                    Species     => $marker->{'Species'}
                )
            ) {
                next if $ignore_syntype{ uc $mrk->{'synonym_type'} };
                $cdbi_clone = $mdb->retrieve_Marker( $mrk->{'marker_id'} );
            }

            if ( !$cdbi_clone ) {
                # Create a new clone marker - copy sequence and clone_lib
                my $clone_seq
                    = Bio::Seq::RichSeq->new( -accession_number => $clone );

                # import_data stores the sequence object under 'seq_io';
                # '_seq' is still accepted in case another caller uses it.
                my $source_seq = $marker->{'seq_io'} || $marker->{'_seq'};
                $clone_seq->species( $source_seq->species ) if $source_seq;

                if ( my $lib = $marker->{'details'}{'clone_lib'} ) {
                    my $feat = Bio::SeqFeature::Generic->new(
                        -tag => { clone_lib => $lib }
                    );

                    $clone_seq->add_SeqFeature( $feat );
                }

                # NOTE(review): process_seq_to_marker is not defined in the
                # part of the file visible here -- confirm where it comes
                # from.
                $cdbi_clone     = process_seq_to_marker(
                    seq         => $clone_seq,
                    analysis    => $marker->{'Analysis'},
                    marker_type => 'Clone',
                    # marker_type => $CDBI_MARKER_TYPES{'clone'},
                );

                $cdbi_clone || die( "Could not create clone for $clone" );
            }

            #
            # No enclosing loop here, so a bare "next" would escape into
            # the caller's loop; use a plain conditional instead.
            #
            my $mid = $cdbi_clone->{'marker_id'};
            if ( !$corr_marker_ids{ $mid } ) {
                $corr_marker_ids{ $mid } = {};
                push @correspondences, [
                    # $CDBI_CORR_TYPES{CLONE_END},
                    'CLONE_END', undef, $mid
                ];
            }
        }
    }

    #
    # Shared synonym correspondences
    #
    my $found_synonym = 0;
    for my $mrk (
        $mdb->search_marker_synonyms(
            synonyms => \@syns,
            Species  => $marker->{'Species'}
        )
    ) {
        #
        # De-duplicate on marker_id, the same key the sections above use
        # (keying on marker_type_id dropped every later marker of a type).
        #
        next if $corr_marker_ids{ $mrk->{'marker_id'} };

        #
        # Look for synonymous marker
        #
        if (
            uc $mrk->{'marker_type'} eq $marker_type
            and (
                uc $mrk->{'synonym_type'} eq 'UNKNOWN'
                or $mrk->{'synonym_type'} eq $syn_type{ $mrk->{'marker_name'} }
            )
        ) {
            $found_synonym++;
            unshift @correspondences, [ 'SYNONYMOUS', $mrk->{'marker_id'} ];
        }
        else {
            #
            # Assume shared synonym
            #
            next if $ignore_syntype{ uc( $mrk->{synonym_type} ) };

            push @correspondences, [
                # $CDBI_CORR_TYPES{SHARED_SYNONYM},
                'SHARED_SYNONYM', $mrk->{marker_id}, undef
            ];
        }

        $corr_marker_ids{ $mrk->{'marker_id'} } = {};
    }

    #
    # The first slot is undef when no synonymous marker was found.
    #
    if ( !$found_synonym ) {
        unshift @correspondences, undef;
    }

    return wantarray ? @correspondences : \@correspondences;
}

# ----------------------------------------------------
sub get_synonyms {
    #
    # Collects the names a marker is known by.
    #
    # Argument: marker hash ref; reads 'seq_io' (the sequence object) and
    # 'details' (attribute hash ref).
    #
    # Returns a list (array ref in scalar context) of hash refs, each with
    # 'marker_name' and 'synonym_type', in this order: accession, primary
    # id (or 'ti' attribute), versioned accessions, secondary accessions,
    # 'trace' attribute.
    #
    my ( $marker ) = @_;
    my $seq     = $marker->{'seq_io'};
    my $details = $marker->{'details'};

    my @found;
    my $add = sub {
        my ( $type, $name ) = @_;
        push @found, { marker_name => $name, synonym_type => $type };
    };

    $add->( 'GENBANK_ACCESSION', $seq->accession );

    # A primary_seq is returned if no primary_id, hence the Bio:: filter
    my $gi = $seq->primary_id || $details->{'ti'};
    if ( $gi && $gi !~ /^Bio\:\:/ ) {
        $add->( 'GENBANK_GI', $gi );
    }

    # Versioned accession: one synonym for every version from 1 upwards
    if ( $seq->can( 'seq_version' ) ) {
        my $acc     = $seq->accession;
        my $version = 1;
        while ( $version <= $seq->seq_version ) {
            $add->( 'GENBANK_VERSION', "$acc.$version" );
            $version++;
        }
    }

    # Secondary accession
    if ( $seq->can( 'get_secondary_accessions' ) ) {
        for my $secondary ( $seq->get_secondary_accessions() ) {
            $add->( 'UNKNOWN', $secondary );
        }
    }

    my $trace = $details->{'trace'};
    $add->( 'UNKNOWN', $trace ) if $trace;

    return @found if wantarray;
    return \@found;
}

1;
