package CSHL::ComparativeMaps::Admin::DataImport;

#-----------------------------------------------------
# $Id: DataImport.pm,v 1.13 2002/04/18 18:07:53 kclark Exp $
#
# File       : DataImport.pm
# Programmer : Ken Y. Clark, kclark@logsoft.com
# Created    : 2001/12/18
# Purpose    : import map data
#-----------------------------------------------------

=pod

=head1 NAME

CSHL::ComparativeMaps::Admin::DataImport - import map data

=head1 SYNOPSIS

  use CSHL::ComparativeMaps::Admin::DataImport;

  my $importer = CSHL::ComparativeMaps::Admin::DataImport->new(db=>$db);
  $importer->import(
      map_study_id => $map_study_id,
      fh           => $fh,
      map_type     => $map_type,
  ) or print "Error: ", $importer->error, "\n";

=head1 DESCRIPTION

This module encapsulates the logic for importing all the various types
of maps into the database.

=cut

use strict;
use vars qw( $VERSION %DISPATCH );
$VERSION  = (qw$Revision: 1.13 $)[-1];
%DISPATCH = (
    +GENETIC  => \&import_genetic,
    +PHYSICAL => \&import_physical,
);

use CSHL::ComparativeMaps::Constants;
use CSHL::ComparativeMaps::Admin::BaseObject;
use CSHL::ComparativeMaps::Admin::UpdatePhysicalFeatures;

use base qw( CSHL::ComparativeMaps::Admin::BaseObject );

#-----------------------------------------------------
sub import {
    my ( $self, %args ) = @_;
    my $db              = $self->db             or die 'No database handle';
    my $map_study_id    = $args{'map_study_id'} or die 'No map study id';
    my $fh              = $args{'fh'}           or die 'No file';
    my $overwrite       = $args{'overwrite'}    || 0;

    $self->be_quiet( $args{'be_quiet'} );

    my $map_type = uc $args{'map_type'} || $db->selectrow_array(
        q[
            select upper(mt.map_type) as map_type
            from   cmap_map_study ms, cmap_map_type mt
            where  ms.map_study_id=?
            and    ms.map_type_id=mt.map_type_id
        ],
        {}, ( $map_study_id )
    );

    my $updater = CSHL::ComparativeMaps::Admin::UpdatePhysicalFeatures->new;

    if ( my $sub = $DISPATCH{ $map_type } ) {
        my $result = $self->$sub(
            map_study_id => $map_study_id,
            fh           => $fh,
            db           => $db,
            overwrite    => $overwrite,
            updater      => $updater,
        );
    }
    else {
        return $self->err_out(
            "Don't know how to import data for maps of type '$map_type.'"
        );
    }

#    if ( $map_type eq GENETIC ) {
#        my $result = $self->import_genetic(
#            map_study_id => $map_study_id,
#            fh           => $fh,
#            db           => $db,
#            overwrite    => $overwrite,
#        );
#    }
#    elsif ( $map_type eq PHYSICAL ) {
#        my $result = $self->import_physical(
#            map_study_id => $map_study_id,
#            fh           => $fh,
#            db           => $db,
#            overwrite    => $overwrite,
#        );
#    }
#    else {
#        return $self->err_out(
#            "Don't know how to import data for maps of type '$map_type.'"
#        );
#    }
}

#-----------------------------------------------------
sub import_genetic {

=pod

=head2 import_genetic

Imports tab- or comma-delimited file with the following format:

  study linkage_group map_position_name position feature_name feature_type

=over 4

=item *   "position" can be in the format "start" or "start-stop"

=item *   "feature_type" must be either "marker" or "centromere"

=back

=cut

    my ( $self, %args ) = @_;
    my $map_study_id    = $args{'map_study_id'};
    my $fh              = $args{'fh'};
    my $db              = $args{'db'};
    my $overwrite       = $args{'overwrite'};
    my $updater         = $args{'updater'};

    #
    # First delete any existing information for the map_study
    #
    my @map_ids = @{
        $db->selectcol_arrayref(
            q[
                select genetic_map_id
                from   cmap_genetic_map
                where  map_study_id=?
            ],
            {}, ( $map_study_id )
        )
    };

    if ( $overwrite ) {
        $self->Print("Deleting map positions for ", scalar @map_ids, 
            " maps for map study $map_study_id\n" );

        $db->do(
            q[
                delete 
                from   cmap_genetic_map_position
                where  genetic_map_id in (].
                       join( ',', @map_ids ).q[)
            ]
        ) if @map_ids;
    }

    my ( %map_ids, %feature_type_ids, %feature_ids );
    $self->Print("Parsing file...\n");
    my $i = 0; # line counter
    while ( <$fh> ) {
        $i++;
        if ( $i % 50 == 0 ) {
            $self->Print(" $i\n");
        }
        else {
            $self->Print('#');
        }

        chomp;
        my ( $study, $linkage_group, $map_position_name, 
             $position, $feature_name, $feature_type 
        ) = map{ s/"//g; $_ } split /\t/;
        next unless $linkage_group;
        next unless $map_position_name;
        next unless $feature_name;
        next if     $position =~ m/[a-zA-Z]/; #NAN

        $feature_type ||= MARKER;
        my $feature_type_id = $feature_type_ids{ uc $feature_type };
        unless ( $feature_type_id ) {
            $feature_type_id = $db->selectrow_array(
                q[
                    select feature_type_id
                    from   cmap_feature_type
                    where  upper(feature_type)=?
                ],
                {},
                ( uc $feature_type )
            ) or die "No feature type id for '$feature_type'!\n";

            $feature_type_ids{ uc $feature_type } = $feature_type_id;
        }

        my ( $start, $stop );
        if ( $position =~ m/(\d+\.?\d+?)-(\d+\.?\d+?)/ ) {
#            ( $start, $stop ) = split( /-/, $position );
            $start = $1;
            $stop  = $2;
        }
        else {
            $start = $position;
            $stop  = undef;
        }
        next if $start && $start !~ FLOAT_REGEX;
        next if $stop  && $stop  !~ FLOAT_REGEX;

        my $feature_key = join('::', uc $feature_name, $feature_type_id);
        my $feature_id  = $feature_ids{ $feature_key };
        unless ( $feature_id ) {
            $feature_id = $db->selectrow_array(
                q[  
                    select feature_id
                    from   cmap_feature
                    where  upper(feature_name)=? 
                    and    feature_type_id=?
                ],
                {}, ( uc $feature_name, $feature_type_id )
            );
            
            unless ( $feature_id ) {
                $feature_id = next_number(
                    db           => $db, 
                    table_name   => 'cmap_feature',
                    id_field     => 'feature_id',
                ) or die 'No feature id';

                $db->do(
                    q[
                        insert 
                        into   cmap_feature 
                               ( feature_id, feature_name, feature_type_id )
                        values ( ?, ?, ? )
                    ],
                    {}, ( $feature_id, $feature_name, $feature_type_id )
                );

                $self->Print(
                    "Created $feature_type $feature_name ($feature_id).\n"
                );
            }

            $self->Print( "No feature_id for $feature_name\n"), next 
                unless $feature_id;
            $feature_ids{ $feature_key } = $feature_id;
        }

        my $genetic_map_id = $map_ids{ $linkage_group };
        unless ( $genetic_map_id ) {
            $genetic_map_id = $db->selectrow_array(
                q[
                    select genetic_map_id
                    from   cmap_genetic_map
                    where  map_study_id=?
                    and    linkage_group=?
                ],
                {}, ( $map_study_id, $linkage_group )
            );

            unless ( $genetic_map_id ) {
                $genetic_map_id = next_number(
                    db           => $db, 
                    table_name   => 'cmap_genetic_map',
                    id_field     => 'genetic_map_id',
                ) or die 'No map id';

                $db->do(
                    q[
                        insert
                        into   cmap_genetic_map 
                               ( genetic_map_id, map_study_id, linkage_group )
                        values ( ?, ?, ? )
                    ],
                    {}, 
                    ( $genetic_map_id, $map_study_id, $linkage_group )
                );
                $self->Print(
                    "Created map $linkage_group ($genetic_map_id).\n"
                );
            }

            $self->Print("No map id for chr. $linkage_group\n"), next 
                unless $genetic_map_id;
            $map_ids{ $linkage_group } = $genetic_map_id;
        }

        my $map_position_id = next_number(
            db           => $db, 
            table_name   => 'cmap_genetic_map_position',
            id_field     => 'genetic_map_position_id',
        ) or die 'No map position id';

        $db->do(
            q[
                insert
                into   cmap_genetic_map_position 
                       ( genetic_map_position_id, map_position_name, 
                         feature_id, genetic_map_id, position_start, 
                         position_stop )
                values ( ?, ?, ?, ?, ?, ? )
            ],
            {}, 
            ( $map_position_id, $map_position_name, 
              $feature_id, $genetic_map_id, $start, $stop 
            )
        );

        my $pos = join('-', map { defined $_ ? $_ : () } $start, $stop);
        $self->Print("Inserted $feature_type '$feature_name' on LG ".
              "$linkage_group ($genetic_map_id) at $pos.\n"
        );
    }

    #
    # Now update the physical relationships just for this genetic map study.
    #
    $updater->update(
        dry_run              => 0,
        genetic_map_study_id => $map_study_id,
        no_prompt            => 0,
    );

    $self->Print("Done\n");
    
    return 1;
}

#-----------------------------------------------------
sub import_physical {

=pod

=head2 import_physical

Purpose: 

This program will parse a tab-delimited input file with columns:

  clone   contig_name    start    end    markers

The data will look like this:

  a0001A06	ctg125	200	233
  a0001A07	ctg219	409	441	C189	RZ537	E018N17r
  a0001A08	ctg224	192	215	30A09r

Any existing records for the map study associated with the 
$map_study_id will be deleted, and the data will be 
reinserted.  Also, a file containing marker aliases will be 
used to determine other names a marker could go by.

=cut

    #
    # Make sure we can open the file arg and a db connection
    #
    my ( $self, %args ) = @_;
    my $map_study_id    = $args{'map_study_id'};
    my $fh              = $args{'fh'};
    my $db              = $args{'db'};
    my $overwrite       = $args{'overwrite'};
    my $updater         = $args{'updater'};

    $self->Print("Parsing file...\n");

    #
    # Parse the file into contig and markers information
    #
    my $i = 0; # line counter
    my ( %contigs, %feature_ids );
    while ( my $line = <$fh> ) {
        $i++;
        if ( $i % 50 == 0 ) {
            $self->Print(" $i\n");
        }
        else {
            $self->Print('#');
        }

        chomp $line;
        my ( $clone, $contig_name, $start, $end, $markers ) =
            map{ s/"//g; $_ } split( /\t/, $line, 5 );

        my $contig = $contigs{$contig_name} || ImportContig->new($contig_name);
        $contig->contig_name( $contig_name );
        $contig->start( $start );
        $contig->end( $end );

        if ( $clone =~ m/^(.+)sd\d+$/ ) {
            push @{ $contig->{'clones'} }, {
                clone_name => $1,
                start      => $start,
                end        => $end,
            };
        }

        #
        # Look up the info for each marker
        #
        my @markers = split( /\t/, $markers ) or next;
        for my $marker_name ( @markers ) {
            my $marker = $contig->{'markers'}{ $marker_name };

            unless ( $marker ) {
                my $feature_id = $feature_ids{ uc $marker_name } || 0;

                unless ( $feature_id ) {
                    $feature_id = $db->selectrow_array( 
                        q[
                            select feature_id 
                            from   cmap_feature
                            where  upper(feature_name)=?
                        ],
                        {}, ( uc $marker_name )
                    ) or next;

                    $feature_ids{ uc $marker_name } = $feature_id;
                }

                $marker = Marker->new( $feature_id );
            }

            $marker->marker_name( $marker_name );
            $marker->contig( $contig_name );
            $marker->start( $start );
            $marker->end( $end );
            $marker->confidence( '+' );

            $contig->{'markers'}{ $marker_name } = $marker;
        }

        $contigs{ $contig_name } = $contig;
    }

    #
    # Delete all the existing records for this map study
    #
    my @map_ids = @{ 
        $db->selectcol_arrayref( 
            q[
                select physical_map_id
                from   cmap_physical_map
                where  map_study_id=?
            ],
            {}, ( $map_study_id )
        )
    };

    if ( $overwrite ) {
        $self->Print( "Deleting ". scalar @map_ids.
            " contigs for physical map study id $map_study_id\n");

        if ( my $map_id_string = join( ',', @map_ids ) ) {
            for my $table ( 
                qw[ cmap_physical_map_position cmap_physical_map ] 
            ) {
                $db->do( 
                    qq[
                        delete 
                        from   $table 
                        where  physical_map_id in ($map_id_string)
                    ]
                );
            }
        }
    }

    #
    # Insert contig information
    #
    for my $contig ( values %contigs ) {
        my $contig_name = $contig->contig_name;
        my $start       = defined $contig->start ? $contig->start : 1;
        my $end         = $contig->end;

        my $physical_map_id = next_number(
                db           => $db, 
                table_name   => 'cmap_physical_map',
                id_field     => 'physical_map_id',
            ) or do { $self->Print( "Couldn't get map id\n" ); next }
        ;

        $db->do(
            q[
                insert 
                into   cmap_physical_map 
                       ( physical_map_id, map_study_id, map_name, 
                         position_start, position_stop )
                values ( ?, ?, ?, ?, ? )
            ],
            {}, ( $physical_map_id, $map_study_id, $contig_name, $start, $end ) 
        );
        $contig->map_id( $physical_map_id );

        $self->Print(
        "Inserted $contig_name ($physical_map_id) with ".
            "start = $start, end = $end\n");

        for my $marker ( values %{ $contig->{'markers'} } ) {
            my $marker_name = $marker->marker_name;
            my $marker_id   = $marker->marker_id;
            my $contig      = $marker->contig;
            my $start       = $marker->start || 1;
            my $end         = $marker->end;
            my $position    = ( $start + $end ) / 2;
            my $confidence  = $marker->confidence;

            my $physical_map_position_id = next_number(
                    db           => $db, 
                    table_name   => 'cmap_physical_map_position',
                    id_field     => 'physical_map_position_id',
                ) or do { warn "Couldn't get map id\n"; next }
            ;

            $db->do( 
                q[
                    insert
                    into   cmap_physical_map_position
                           ( physical_map_position_id, map_position_name, 
                             feature_id, physical_map_id, position_start, 
                             bac_hits 
                           )
                    values ( ?, ?, ?, ?, ?, ? )
                ],
                {}, 
                ( $physical_map_position_id, $marker_name, $marker_id, 
                  $physical_map_id, $position, $confidence 
                ) 
            );

            $self->Print("  Inserted marker $marker_name at $position\n");
        }

        #
        # Get the feature type id for a clone.
        #
        my $feature_type_id = $db->selectrow_array(
            q[
                select feature_type_id
                from   cmap_feature_type
                where  upper(feature_type)=?
            ],
            {},
            ( CLONE )
        ) or die "No feature type id for CLONE!\n";

        #
        # Insert each clone as a feature on a physical map.
        #
        for my $clone ( @{ $contig->{'clones'} } ) {
            my $clone_name = $clone->{'clone_name'};
            my $feature_id = $feature_ids{ $clone_name } || 0;
            unless ( $feature_id ) {
                $feature_id = $db->selectrow_array(
                    q[
                        select feature_id
                        from   cmap_feature
                        where  upper(feature_name)=?
                    ],
                    {},
                    ( uc $clone_name )
                ) || 0; 
            }

            unless ( $feature_id ) {
                $feature_id    =  next_number(
                    db         => $db,
                    table_name => 'cmap_feature',
                    id_field   => 'feature_id',
                ) or die "Couldn't get next numbmer for cmap_feature.\n";

                $db->do(
                    q[
                        insert 
                        into   cmap_feature 
                               ( feature_id, feature_name, feature_type_id )
                        values ( ?, ?, ? )
                    ],
                    {},
                    ( $feature_id, $clone_name, $feature_type_id )
                );
            }

            $feature_ids{ $clone_name } = $feature_id;

            my $physical_map_position_id = next_number(
                db         => $db,
                table_name => 'cmap_physical_map_position',
                id_field   => 'physical_map_position_id',
            ) or die 
            "Couldn't get next number for cmap_physical_map_position.\n";

            $db->do(
                q[
                    insert
                    into   cmap_physical_map_position
                           ( physical_map_position_id, map_position_name,
                             feature_id, physical_map_id, position_start,
                             position_stop
                           )
                    values ( ?, ?, ?, ?, ?, ? )
                ],
                {},
                ( $physical_map_position_id, $clone_name, $feature_id,
                  $contig->map_id, $clone->{'start'}, $clone->{'end'} 
                )
            );

            $self->Print(
                "  Inserted clone ", $clone->{'clone_name'}, 
                " (", $clone->{'start'}, ", ", $clone->{'end'}, ").\n"
            );
        }
    }

    #
    # Now update the physical relationships just for this physical map study.
    #
    $updater->update(
        dry_run               => 0,
        physical_map_study_id => $map_study_id,
        no_prompt             => 0,
    );


    $self->Print("Done\n"); 
    return 1;
}

#-----------------------------------------------------
sub extract_numbers {
    my $arg = shift;
    $arg =~ s/[^\d]//g;
    return $arg;
}

# ----------------------------------------------------
package ImportContig;
# ----------------------------------------------------

sub new {
    my $class     = shift;
    my $self      = { contig_name => shift() };
    return bless $self, $class;
}

#-----------------------------------------------------
sub start {
#
# Take the minimum start
#
    my $self = shift;
    if ( my $arg = shift ) {
        $self->{'start'} = $arg unless defined $self->{'start'};
        $self->{'start'} = $arg if $arg < $self->{'start'};
    }
    return $self->{'start'};
}

#-----------------------------------------------------
sub end {
#
# Take the maximum end
#
    my $self = shift;
    if ( my $arg = shift ) { 
        $self->{'end'} = $arg if $arg > $self->{'end'};
    }
    return $self->{'end'};
}

#-----------------------------------------------------
sub contig_name {
    my $self = shift;
    if ( my $arg = shift ) {
        $self->{'contig_name'} = $arg;
    }
    return $self->{'contig_name'};
}

#-----------------------------------------------------
sub map_id {
    my $self = shift;
    $self->{'map_id'} = shift if @_;
    return $self->{'map_id'};
}

# ----------------------------------------------------
package Marker;
# ----------------------------------------------------

sub new {
    my $class     = shift;
    my $marker_id = shift;
    my $self      = { marker_id => $marker_id };
    return bless $self, $class;
}

#-----------------------------------------------------
sub start {
#
# Take the maximum start
#
    my $self = shift;
    if ( my $arg = shift ) {
        $self->{'start'} = $arg if $arg > $self->{'start'};
    }
    return $self->{'start'};
}

#-----------------------------------------------------
sub end {
#
# Take the minimum end
#
    my $self = shift;
    if ( my $arg = shift ) { 
        $self->{'end'} = $arg unless defined $self->{'end'};
        $self->{'end'} = $arg if $arg < $self->{'end'};
    }
    return $self->{'end'};
}

#-----------------------------------------------------
sub marker_name {
    my $self = shift;
    if ( my $arg = shift ) {
        $self->{'marker_name'} = $arg;
    }
    return $self->{'marker_name'};
}


#-----------------------------------------------------
sub contig {
    my $self = shift;
    if ( my $arg = shift ) {
        $self->{'contig'} = $arg;
    }
    return $self->{'contig'};
}

#-----------------------------------------------------
sub marker_id {
    return shift->{'marker_id'};
}

#-----------------------------------------------------
sub confidence {
    my ( $self, $arg ) = @_;
    if ( $arg =~ m/^\d+$/ ) {
        $self->{'confidence'} = $arg;
    }
    elsif ( $arg =~ m/^\+$/ ) {
        $self->{'confidence'} += 1;
    }
    return $self->{'confidence'} || 0;
}

1;

#-----------------------------------------------------
# Which way does your beard point tonight?
# Allen Ginsberg
#-----------------------------------------------------

=pod

=head1 AUTHOR

Ken Y. Clark, kclark@logsoft.com

=head1 SEE ALSO

perl(1).

=cut
