package GDPDM::Import;

# vim: set ts=4 sw=4 et tw=78:

# $Id: Import.pm,v 1.7 2006/10/27 14:35:29 kycl4rk Exp $

use strict;
use Carp qw( croak );
use Data::Dumper;
use Data::Stag;
use GDPDM::CDBI;
use GDPDM::Utils qw( table_name_to_class get_logger );
use Graph::Directed;
use Graph::Traversal::DFS;
use Readonly;

Readonly my $EMPTY_STR => q{};

my $DB;

$SIG{'INT'} = sub {
    my $logger = get_logger;
    $logger->alert('Import aborted');
    $DB->rollback if defined $DB;
    exit 0;
};

# --------------------------------------------------------
sub import_data {
    my $class     = shift;
    my $file      = shift or croak 'No input file';
    my $opts_ref  = shift || {};
    my $logger    = get_logger( $opts_ref );

    $logger->info("Starting parse of '$file' with Data::Stag");
    my $stag;
    eval {
        $stag = Data::Stag->parse( $file );
    };

    if ( $@ || ! defined $stag ) {
        croak sprintf( "Error parsing '$file'%s\n", $@ ? ": $@" : $EMPTY_STR);
    }

    if ( my $db_name = $opts_ref->{'db_name'} ) {
        GDPDM::CDBI->db_Main( $db_name );
    }

    $logger->info("Starting import of $file");

    my $DB = GDPDM::CDBI->db_Main;
    $DB->{'RaiseError'} = 1;
    $DB->{'PrintError'} = 0;
    $DB->{'AutoCommit'} = 0;

    my %class_to_node 
        = map { table_name_to_class($_), $_ } GDPDM::CDBI->represented_tables;

    my $lookup = _verify( $stag, \%class_to_node, $logger );

    if ( $opts_ref->{'verify_only'} ) {
        $logger->info("Verification complete.");
        return 1;
    }

    # Make a depth-first search of our graph.  This ensures that we
    # insert records first upon which later inserts will rely.  E.g.,
    # we will insert all the taxonomy records found first before
    # trying to insert passport records that would be linked.
    my $g = GDPDM::CDBI->as_graph;
    my $d = Graph::Traversal::DFS->new( $g );
    $d->preorder;

    my %valid_node = map { $_, 1 } GDPDM::CDBI->represented_tables;

    my $num_imported = 0;
    eval {
        for my $node_name ( $d->dfs ) {
            my $num_in_node = scalar values %{ $lookup->{$node_name} } or next;
            $logger->info("Importing $num_in_node $node_name records");

            my $class   = table_name_to_class( $node_name );
            my $pk_name = $class->columns('Primary');
            my $has_a   = $class->meta_info('has_a') || {};

            my @has_a;
            while ( my ( $fk_field, $fk_info ) = each %$has_a ) {
                my $fk_class = $fk_info->{'foreign_class'};
                my $fk_node  = $class_to_node{ $fk_class } or 
                               die "No node for '$fk_class'\n";
                push @has_a, [ $fk_field, $fk_node ];
            }

            for my $data ( values %{ $lookup->{ $node_name } } ) {
                my @fields = keys %$data;

                # Need to unescape parens from S-expression
                for my $value ( values %$data ) {
                    next if !defined $value || $value eq $EMPTY_STR;
                    $value =~ s{ \\ ([()]) }{$1}xmsg; 
                } 

                for my $fk ( @has_a ) {
                    my ( $fk_field, $fk_node ) = @$fk;
                    my $fk_value = $data->{ $fk_field };
                    next if !defined $fk_value || $fk_value eq $EMPTY_STR;

                    my $new_fk_id  
                        = $lookup->{ $fk_node }{ $fk_value }{'new_pk_id'} or
                        croak "No new PK for $fk_node '$fk_value'";

                    $data->{ $fk_field } = $new_fk_id;
                }

                my $object = _find_or_create(
                    class  => $class,
                    fields => \@fields,
                    data   => $data,
                    update => $opts_ref->{'update'},
                    logger => $logger,
                );

                for my $field ( @fields ) {
                    next if $field eq $pk_name;
                    my $val = $data->{ $field };
                    if ( defined $val && $val ne $EMPTY_STR ) {
                        $object->$field( $val );
                    }
                }

                $object->update;

                $logger->info( "  + $pk_name $object->{ $pk_name }");

                # Remember the new PK value for referencing objects.
                $data->{'new_pk_id'} = $object->id;
                $num_imported++;
            }
        }

        $DB->commit if defined $DB;
    };

    if ( my $err = $@ ) {
        eval { $DB->rollback if defined $DB };
        $logger->alert("Error importing $file: $err");
        croak "ERROR: $err\n";
    }
    else {
        $logger->info(
            "Finished processing '$file', imported $num_imported records."
        );
    }

    return $num_imported;
}

# --------------------------------------------------------
sub _verify {
    my ( $stag, $class_to_node, $logger ) = @_;

    my %nodes      = map { $_->[0], 1 } $stag->kids;
    my %valid_node = map { $_, 1 } GDPDM::CDBI->represented_tables;
    my $g          = GDPDM::CDBI->as_graph;
    my $d          = Graph::Traversal::DFS->new( $g, next_alphabetic => 1 );
    $d->preorder;

    #
    # First we'll traverse the nodes of the Stag structure and check
    # all the field names, see if foreign key references make sense,
    # and en route create a graph of the nodes.
    #
    my %lookup;
    my @errors;
    NODE:
    for my $node ( keys %nodes ) {
        $logger->info("Verifying node '$node'");

        if ( !$valid_node{ $node } ) {
            my $e = "Bad node '$node'";
            $logger->info( $e );
            push @errors, $e;
            next NODE;
        }

        my $class   = table_name_to_class( $node );
        my %valid   = map { $_, 1 } $class->columns('All');
        my $pk_name = $class->columns('Primary');
        my @fks     = keys %{ $class->meta_info('has_a') || {} };

        OBJECT:
        for my $object ( $stag->find( $node ) ) {
            next OBJECT unless UNIVERSAL::isa($object, 'Data::Stag::StagImpl');
            my %data = $object->hash;

            # Make sure fields are valid
            if ( my @bad = grep { ! defined $valid{ $_ } } keys %data ) {
                push @errors, sprintf(
                    "Unrecognized field%s in $node (%s = %s): %s", 
                    @bad > 1 ? 's' : $EMPTY_STR, 
                    $pk_name,
                    $data{ $pk_name },
                    join( ', ', @bad ),
                );
            }

            for my $fk ( @fks ) {
                if ( !defined $data{ $fk } || $data{ $fk } eq $EMPTY_STR ) {
                    delete $data{ $fk };
                }
            }

            $lookup{ $node }{ $data{ $pk_name } } = \%data;
        }
    }

    for my $node ( keys %lookup ) {
        my $class   = table_name_to_class( $node );
        my $pk_name = $class->columns('Primary');
        my $has_a   = $class->meta_info('has_a') or next;
    
        while ( my ( $fk_field, $fk_info ) = each %$has_a ) {
            my $fk_class   = $fk_info->{'foreign_class'};
            my $fk_id_name = $fk_class->columns('Primary');
            my $fk_node    = $class_to_node->{ $fk_class }
                or die "No node for '$fk_class'\n";
            $has_a->{ $fk_field }{'fk_check'} = [ $fk_node, "$fk_id_name" ];
        }

        # Check foreign key references.
        for my $object ( values %{ $lookup{ $node } } ) {
            while ( my ( $fk_field, $fk_info ) = each %$has_a ) {
                my $fk_value = $object->{ $fk_field };
                next unless defined $fk_value && $fk_value ne $EMPTY_STR;

                my ( $fk_node, $fk_id_name ) = @{ $fk_info->{'fk_check'} };

                $logger->debug(
                    "Checking FK to $fk_node.$fk_id_name $fk_value"
                );

                if (!exists $lookup{ $fk_node }{ $fk_value }) {
                    push @errors, 
                        "$node.$pk_name '$object->{ $pk_name }' " .
                        "references missing $fk_node.$fk_id_name " .
                        "'$fk_value'";
                }
            }
        }
    }

    if ( @errors ) {
        croak join "\n", 
            'Errors in verification:',
            @errors,
            'Import aborted.',
        ;
    }

    return \%lookup;
}

# ----------------------------------------------------
sub _find_or_create {
    my %args     = @_;
    my $data     = $args{'data'} or croak 'No data';
    my $class    = $args{'class'};
    my $update   = $args{'update'};
    my $logger   = $args{'logger'};
    my $pk_name  = $class->columns('Primary');
    my @unique   = ( $class->columns('Unique') );

    my $new;
    if ( $update ) {
        $new = $class->retrieve( $data->{ $pk_name } );
    }
    else {
        # Use a unique key to lookup
        if ( @unique ) {
            for my $field ( @unique ) {
                my $value = $data->{ $field };
                next if !defined $value || $value eq $EMPTY_STR;
                ( $new ) = $class->search( $field, $value );
                last if $new;
            }

            # If found, update with new data
            if ( $new ) {
                for my $field ( keys %$data ) {
                    next if $field eq $pk_name;
                    my $value = $data->{ $field };
                    next if !defined $value || $value eq $EMPTY_STR;
                    $new->$field( $value );
                }
                $new->update;
            }
        }
        
        if ( !$new ) {
            my @fields = grep { $_ ne $pk_name } @{ $args{'fields'} || [] };

            # Search using just the defined fields with data
            my %search;
            for my $fld ( @fields ) {
                if ( 
                       defined $data->{ $fld } 
                    && $data->{ $fld } ne $EMPTY_STR 
                ) {
                    $search{ $fld } = $data->{ $fld };
                }
            }

            my @found = $class->search( %search );

            # If something is found, see if it matches entirely
            FIND:
            for my $find ( @found ) {
                my $ok = 1;

                CHECK:
                for my $fld ( @fields ) {
                    next CHECK if !defined $data->{ $fld };

                    my $cur_value = $find->$fld();
                    my $new_value = $data->{ $fld };

                    # If there's a difference, skip it
                    if ( $cur_value ne $new_value ) {
                        $ok = 0;
                        last CHECK;
                    }
                }

                if ( $ok ) {
                    $new = $find;
                    last FIND;
                }
            }

            # If all else fails, create it anew
            if ( !$new ) {
                $new = $class->insert( \%search );
            }
        }
    }

    return $new;
}

1;

# --------------------------------------------------------

=pod

=head1 NAME

GDPDM::Import - import data file into a GDPDM schema

=head1 VERSION

This documentation refers to version $Revision: 1.7 $.

=head1 SYNOPSIS

  use GDPDM::Import;

=head1 DESCRIPTION

Imports a data file.

=head1 METHODS

=head2 import_data

  GDPDM::Import->import_data( $file );

Imports a structured file (S-expression, XML, indented).

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
