#!/usr/local/bin/perl

# $Id: import-cmap.pl,v 1.3 2005/10/20 16:06:39 steven Exp $

=head1 NAME

import-cmap.pl - imports CMap data into marker db

=head1 SYNOPSIS

  import-cmap.pl [options]

Options:

  -h|--help                Show brief help and exit
  -v|--version             Show version and exit
  -m|--map-set-aids=X[,Y]  CMap map set accession IDs
  -d|--datasource=X        CMap data source
  -l|--log=X               Create a log file "X"
  -f|--feature-type=X[,Y]  Restrict to only features of 
                           defined types
  -c|--create              Create markers found in CMap 
                           but not marker db

=head1 DESCRIPTION

Describe what the script does, what input it expects, what output it
creates, etc.

Notes from Noel:
 
 RICE MARKER PREFICES
 ====================
 
 Prefix Used_by Species Type                            Corresponds_to  Suffix
 -----------------------------------------------------------------------------
 RG     Cornell rice    RFLP genomic
 RZ     Cornell rice    RFLP cDNA
 RM     Cornell rice    SSR
 
 CDO    Cornell oat     RFLP cDNA
 BCD    Cornell barley  RFLP cDNA
 
 C      JRGP    rice    RFLP cDNA (Nipponbare callus)                   S
 R      JRGP    rice    RFLP cDNA (Nipponbare root)                     S
 G      JRGP    rice    RFLP genomic                                    S
 L      JRGP    rice    RFLP genomic (NotI-linking)                     S
 Y      JRGP    rice    RFLP (Nipponbare YAC-end clone)                 L,R
 TEL    JRGP    rice    RFLP (subtelomere clone)                        S
 W      JRGP    wheat   RFLP                            PSR             S
 S      JRGP    rice    RFLP cDNA (Nipponbare shoot)                    S
 F      JRGP    rice    RFLP cDNA (photoperiod-1)                       S
 B      JRGP    barley  RFLP                            ABC,ABG,ksu     S
 M      JRGP    maize   RFLP                            umc,csu,bnl,asg S
 P      JRGP    rice    RAPD                                            S
 T      JRGP    rice    STS                                             S
 V      JRGP    rice    RFLP                            (various)       S

 E                      RFLP derived from EST
 
A copy number suffix (consisting of "A" or "B", sometimes "C" or "D",
rarely "E", "F" or "X") may be appended to the marker name.  To
construct a well-formed alternate name, this suffix should be removed.

The JRGP markers may have a suffix "S" before the copy number suffix.
This suffix should also be removed.

The YAC-end clones may have a suffix "L" or "R" before the copy number
suffix.  Do NOT remove this suffix.

=head1 SEE ALSO

Gramene::Marker::DB.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2005 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut

# ----------------------------------------------------

use strict;
use Bio::GMOD::CMap;
use File::Basename;
use Getopt::Long;
use Gramene::DB;
use Gramene::Marker::DB;
use Pod::Usage;

use vars qw[ $VERSION ];
$VERSION = sprintf "%d.%02d", q$Revision: 1.3 $ =~ /(\d+)\.(\d+)/;

#
# Marker types
#
use constant AFLP  => 'AFLP';
use constant INDEL => 'Indel';
use constant RFLP  => 'RFLP';
use constant RAPD  => 'RAPD';
use constant SSR   => 'SSR';
use constant STS   => 'STS';

#
# Species names
#
use constant BARLEY => 'barley';
use constant OAT    => 'oat';
use constant RICE   => 'rice';
use constant MAIZE  => 'maize';
use constant WHEAT  => 'wheat';

my ( $help, $show_version, $data_source, $map_set_aids, $create, $log,
    $feature_types );
GetOptions(
    'h|help'            => \$help,
    'v|version'         => \$show_version,
    'm|map-set-aids=s'  => \$map_set_aids,
    'd|datasource:s'    => \$data_source,
    'c|create'          => \$create,
    'l|log:s'           => \$log,
    'f|feature-types:s' => \$feature_types,
);
pod2usage(2) if $help;

if ( $show_version ) {
    my $prog = basename( $0 );
    print "$prog v$VERSION\n";
    exit(0);
}

my @map_set_aids = split( /,/, $map_set_aids );

my $cmap = Bio::GMOD::CMap->new;
if ( $data_source ) {
    $cmap->data_source( $data_source ) or die $cmap->error;
}
$data_source = $cmap->data_source;
my $cdb = $cmap->db or die $cmap->error;
my $lib = Gramene::Marker::DB->new( admin => 1 );
my $mdb = $lib->db;

my ( $log_fh, %done );
if ( $log ) {
    if ( -e $log ) {
        open my $fh, $log or die "Can't read '$log': $!\n";
        my $hdr = <$fh>;
        while ( <$fh> ) {
            chomp;
            my @flds = split( /\t/, $_ );
            $done{ $flds[-1] }++;
        }
        close $fh;

        open $log_fh, ">>$log" or die "Can't write to '$log': $!\n";
    }
    else {
        open $log_fh, ">$log" or die "Can't write to '$log': $!\n";
        print $log_fh join("\t", qw/
            species
            marker_type
            feature_name
            marker_id
            action
            cmap_feature_aid
        / ), "\n";
    }
}

my @map_sets;
if ( @map_set_aids ) {
    for my $ms_aid ( @map_set_aids ) {
        my $sth = $cdb->prepare(
            q[
                select ms.map_set_id,
                       ms.map_set_acc as map_set_aid,
                       ms.map_type_acc as map_type_aid,
                       ms.map_set_short_name as map_set_name,
                       s.species_acc as species_aid,
                       s.species_common_name as species
                from   cmap_map_set ms, cmap_species s
                where  ms.map_set_acc=?
                and    ms.species_id=s.species_id
            ]
        );
        $sth->execute( $ms_aid );
        my $map_set = $sth->fetchrow_hashref or next;
        push @map_sets, $map_set;
    }

    die "No valid map sets in arg '$map_set_aids'\n" unless @map_sets;
}
else {
    @map_sets = @{
        $cdb->selectall_arrayref(
            q[
                select   ms.map_set_id,
                         ms.map_set_acc as map_set_aid,
                         ms.map_type_acc as map_type_aid,
                         ms.map_set_short_name as map_set_name,
                         s.species_acc as species_aid,
                         s.species_common_name as species
                from     cmap_map_set ms, cmap_species s
                where    ms.species_id=s.species_id
                order by map_type_acc, s.display_order, s.species_common_name,
                         ms.display_order, ms.map_set_short_name
            ],
            { Columns => {} }
        )
    };
}

my @ft_aids = split( /,/, $feature_types );
    
print join("\n",
    "OK to import the following from CMap's '$data_source':",
    ( map { "  $_->{'map_type_aid'}-$_->{'species'}-$_->{'map_set_name'}" } 
        @map_sets ),
    sprintf( 'Create features: %s', $create  ? 'Yes' : 'No' ),
    sprintf( 'Feature Types  : %s', @ft_aids ? join(', ', @ft_aids) : 'All' ),
    sprintf( 'Log file       : %s', $log_fh  ? $log : 'None' ),
    '[Y/n] '
);
chomp( my $answer = <STDIN> );
if ( $answer =~ /^[Nn]/ ) {
    print "Not OK, exiting.\n";
    exit(0);
}

my $feature_sql = sprintf(
    q[
        select f.feature_id,
               f.feature_acc as feature_aid,
               f.feature_name,
               f.feature_type_acc as feature_type_aid,
               f.feature_start as start_position,
               f.feature_stop as stop_position
        from   cmap_feature f
        where  f.map_id=?
        %s
    ],
    scalar @ft_aids == 1 ? "and f.feature_type_acc='$ft_aids[0]' " :
    scalar @ft_aids  > 1 ? 'and f.feature_type_acc in (' . 
        join(', ', map { qq['$_'] } @ft_aids ) . ')' : 
    ''
);

my ( $no_map_sets, $no_features ) = (0,0);
for my $map_set ( @map_sets ) {
    my $ms_aid = $map_set->{'map_set_aid'};
    print "Processing map set '$ms_aid'\n";

    unless ( $map_set ) {
        print "Bad map set accession '$ms_aid'\n";
        next;
    }

    #
    # Verify the species, map type, germplasm
    #
    my $map_set_species = $map_set->{'species'};
    my $germplasm       = 'Unknown';
    my ( $ms_species_id, $ms_germplasm_id, $marker_species_id, 
        $map_type_id, $marker_germplasm_id, $map_set_id );

    my $maps = $cdb->selectall_arrayref(
        q[
            select map.map_id,
                   map.map_acc as map_aid,
                   map.map_name,
                   map.map_start as start_position,
                   map.map_start as stop_position
            from   cmap_map map
            where  map.map_set_id=?
        ],
        { Columns => {} },
        ( $map_set->{'map_set_id'} )
    );

    for my $map ( @$maps ) {
        my $features = $cdb->selectall_arrayref(
            $feature_sql, { Columns => {} }, ( $map->{'map_id'} )
        );
        print "Found ", scalar @$features, 
            " features on map $map->{map_name}\n";

        my $map_id;
        for my $f ( @$features ) {
            next if $f->{'feature_type_aid'} =~ /^(qtl|phen|int-phen|cen)$/;
            next if $done{ $f->{'feature_aid'} }++;

            my $feature_name   = $f->{'feature_name'};
            my $marker_type    = $f->{'feature_type_aid'};
            my $marker_species = $map_set->{'species'};

            $f->{'aliases'} = $cdb->selectcol_arrayref(
                q[
                    select alias
                    from   cmap_feature_alias
                    where  feature_id=?
                ],
                {},
                ( $f->{'feature_id'} )
            );

            my $new_species;
            if ( 
                $marker_type eq 'marker' || $marker_type eq 'low-lod'
            ) {
                if ( $feature_name =~ /^CDO\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = OAT;
                }
                elsif ( $feature_name =~ /^(BCD|B)\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = BARLEY;
                }
                elsif ( $feature_name =~ /^RM\d+/i ) {
                    $marker_type = SSR;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^W\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = 'wheat';
                }
                elsif ( $feature_name =~ /^M\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = MAIZE;
                }
                elsif ( $feature_name =~ /^P\d+/i ) {
                    $marker_type = 'RAPD';
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^T\d+/i ) {
                    $marker_type = STS;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^P\d+\/M\d+/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^PC\d+M\d+/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^P\d+\/M\d+-\d+/ ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^E\d+M\d+.*\d+-P[12]$/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^E\d+M\d+-\d+/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^E\d+\/M\d+-\d+/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^EMP?\d+_\d+/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^ME\d+_\d+/i ) {
                    $marker_type = AFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^CSU\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = MAIZE;
                }
                elsif ( $feature_name =~ /^CT\d+/i ) {
                    $marker_type = SSR;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^GA\d+/i ) {
                    $marker_type = SSR;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^HHU\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = MAIZE;
                }
                elsif ( $feature_name =~ /^ISU\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = MAIZE;
                }
                elsif ( $feature_name =~ /^OSR\d+/i ) {
                    $marker_type = SSR;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^UAZ\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = MAIZE;
                }
                elsif ( $feature_name =~ /^(UMC|BNL)\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = MAIZE;
                }
                elsif ( $feature_name =~ /^XNpb\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = RICE;
                }
                elsif ( 
                    $feature_name =~ /^(C|E|F|G|L|R|S|V|Y|RG|RZ|TEL)\d+/i 
                ) {
                    $marker_type = RFLP;
                    $new_species = RICE;
                }
                elsif ( $feature_name =~ /^WG\d+/i ) {
                    $marker_type = RFLP;
                    $new_species = WHEAT;
                }
                elsif ( $feature_name =~ /kyy03/i ) {
                    $marker_type = RFLP;
                }
                elsif ( 
                    $map_set->{'map_set_aid'} eq 'kl2004' &&
                    $feature_name =~ /^TXA/i 
                ) {
                    $marker_type = AFLP;
                }
                elsif ( 
                    $map_set->{'map_set_aid'} eq 'kl2004' &&
                    $feature_name =~ /^TXI/i 
                ) {
                    $marker_type = INDEL;
                }
                elsif ( 
                    $map_set->{'map_set_aid'} eq 'kl2004' &&
                    $feature_name =~ /^(TXP|GAP)/i 
                ) {
                    $marker_type = SSR;
                }
                elsif ( 
                    $map_set->{'map_set_aid'} eq 'kl2004' &&
                    $feature_name =~ /^(TXS|STS)/i 
                ) {
                    $marker_type = RFLP;
                }
            }
            elsif ( $marker_type =~ /^(BAC end sequence)-(.*)/ ) {
                $marker_type = $1;
                $new_species = $2;
            }

            $marker_type = 'Undefined' if $marker_type eq 'marker';

            #
            # Reset species_id? 
            #
            my $primary_feature_name = $feature_name;
            if ( $new_species ) {
                $marker_species = $new_species;

                if (
                    #
                    # Some BNL marker have dot replaced by underscore,
                    # also a trailing "a" or "b" suffix
                    #
                    $feature_name =~ /^BNL\d+_\d+[a-zA-Z]?/i
                ) {
                    push @{ $f->{'aliases'} }, $feature_name;
                    $primary_feature_name = $feature_name;
                    $primary_feature_name =~ s/_/./;
                    $primary_feature_name =~ s/[a-zA-Z]$//;
                }
                elsif ( 
                    #
                    # "Y" markers may have also have "L" or "R" penultimate
                    # suffix before the "A" or "B" suffix.  Remove that last
                    # letter, but not the "L" or "R".
                    #
                    $feature_name =~ /^(Y.*\d+[LR]?)[a-zA-Z]$/ ||

                    #
                    # Remove (K) or (C) suffix.
                    #
                    $feature_name =~ /^(\S*)\s*\((K|C)\)/ ||

                    #
                    # Otherwise just try to remove any suffix
                    #
                    $feature_name =~ /(.*\d+)S?[a-zA-Z]$/
                ) {
                    my $unsuffixed = $1;
                    push @{ $f->{'aliases'} }, $feature_name;
                    $primary_feature_name = $unsuffixed;
                }
                elsif (
                    #
                    # RM markers should have leading zeros removed
                    #
                    $feature_name =~ /^(RM)0+(\d+)/i
                ) {
                    my $unpadded = "$1$2";
                    push @{ $f->{'aliases'} }, $feature_name;
                    $primary_feature_name = $unpadded;
                }
            }

            my ( $marker_id, $marker_existed, @marker_ids );
            if ( $create ) {
                $marker_species_id = 
                    $lib->find_or_create_species( $marker_species );
                ( $marker_id, $marker_existed ) = $lib->find_or_create_marker( 
                    marker_name => $primary_feature_name,
                    synonyms    => $f->{'aliases'},
                    marker_type => $marker_type,
                    species_id  => $marker_species_id,
                ) or die $lib->error;
                @marker_ids = ( $marker_id );
            }
            else {
                my %args = (
                    marker_name => join(',', 
                        $primary_feature_name, @{ $f->{'aliases'} }
                    )
                );

#                $args{'marker_type'} = $marker_type 
#                    unless $marker_type eq 'Undefined';

                # 
                # Only include species for markers on rice maps.
                # 
                if ( $map_set_species eq 'Rice' ) {
                    $args{'species_id'} = 
                        $lib->find_or_create_species( $marker_species );
                }

                my $matches = $lib->marker_search( %args );
                @marker_ids = map { $_->{'marker_id'} } @$matches;
            }

            my $action = '';
            if ( $marker_id ) {
                $action = $marker_existed 
                    ? 'existed' 
                    : $create ? 'created' : 'updated';
            }
            elsif ( @marker_ids ) {
                $action = 'matched multiple';
            }
            else {
                $action = 'skipped';
            }

            print $log_fh join("\t",
                $marker_species,
                $marker_type,
                $primary_feature_name,
                @marker_ids ? join(',', @marker_ids) : 'Not found',
                $action,
                $f->{'feature_aid'},
            ) . "\n" if $log_fh;

            next unless @marker_ids;

            $marker_species_id ||= 
                $lib->find_or_create_species($marker_species);

            $map_type_id   ||= $lib->find_or_create_map_type(
                $map_set->{'map_type_aid'}
            ) or die $lib->error;

            $marker_germplasm_id  ||= $lib->find_or_create_germplasm( 
                $germplasm, $marker_species_id 
            ) or die $lib->error;

            $ms_species_id ||= $lib->find_or_create_species(
                $map_set_species
            ) or die $lib->error;

            $ms_germplasm_id ||= $lib->find_or_create_germplasm(
                $germplasm, $ms_species_id
            ) or die $lib->error;

            $map_set_id ||= $lib->find_or_create_map_set(
                map_set_name           => $map_set->{'map_set_name'},
                map_type_id            => $map_type_id,
                species_id             => $ms_species_id,
                germplasm_id1          => $ms_germplasm_id,
                cmap_map_set_accession => $map_set->{'map_set_aid'},
            ) or die $lib->error;

            $map_id ||= $lib->find_or_create_map( 
                cmap_map_accession => $map->{'map_aid'},
                map_set_id         => $map_set_id,
                map_name           => $map->{'map_name'},
                start              => $map->{'start_position'},
                end                => $map->{'stop_position'},
            ) or die $lib->error;

            for my $m_id ( @marker_ids ) {
                $lib->set_marker_mapping(
                    marker_id              => $m_id,
                    marker_name            => $feature_name,
                    #synonyms               => $f->{'aliases'}       || [],
                    map_id                 => $map_id,
                    cmap_feature_accession => $f->{'feature_aid'}   || '',
                    start                  => $f->{'start_position'},
                    end                    => $f->{'stop_position'},
                ) or die $lib->error;
            }
            $no_features++;
        }
    }
    $no_map_sets++;
}

print "Done.  Processed $no_map_sets map sets and $no_features features.\n";
