#!/usr/local/bin/perl

=pod

=head1 NAME

load-markers-by-analysis.pl - load MarkersDB data via an analysis

=head1 SYNOPSIS

  load-markers-by-analysis.pl [options]

Options:

  -h|--help         Show brief help and exit.
  -v|--verbose      Talk about what's happening.
  -c|--config_file  Path to DB config file, def $ENV{GrameneConfPath}.
  -a|--analysis     analysis_name in MarkersDB analysis table.
  -s|--species      Use this species if none found in sequence stream
  -d|--datadir      Path to tmp dir, or dir containing existing files.
  -r|--reuse        Use existing data files rather than downloading. 
  -n|--no_insert    Do not make changes to the database. Useful for debug.
  --logdir          Directory to write logs into. Def $PWD
  --logfile         File to write logs into. Def date.pid.analysis.log

=head1 DESCRIPTION

The program retrieves the requested 'analysis' from the MarkersDB
(connection defined by config_file) and uses the query defined therein
to pull entries from the indicated data source (or from file if
'--reuse') and load them into the MarkersDB.

=head2 --config_file

The config_file configures the connection to the Gramene MarkersDB
Both read-write and read-only connections should be configured (the
latter for debug). A read-only connection to the OntologyDB should
also be configured. Finally, the base directory of the gramene project
should be configured.

If unspecified, the GrameneConfPath environment variable will be used,
followed by /usr/local/gramene/conf/gramene.conf.

An example config_file;

  <markers_admin>
    db_dsn   dbi:mysql:markersXX:myhost:3307
    db_user  marker_rw_user
    db_pass  secret
  </markers_admin>
  <markers>
    db_dsn   dbi:mysql:markersXX:myhost:3307
    db_user  marker_ro_user                                                   
    db_pass  secret
  </markers>
  <ontology>
    db_dsn   dbi:mysql:ontologyXX:myhost:3306
    db_user  ontology_ro_user
    db_pass  secret
  </ontology>    
  <gramene>
    base_dir /usr/local/gramene
  </gramene>


=head2 --analysis

The value of the analysis param must refer to an entry in the analysis
table of the MarkersDB that is configured in config_file. The fields
in the analysis table are described below;

=head3 analysis.analysis_name

An identifier for the analysis. Used for the --analysis arg.  The name
is parsed to determine the marker types that will be loaded. The
setting of the marker_type may change in future.

=head3 analysis.type

The type indicates the source of the data, and the format in which the
raw data is expected.

 * entrez
   Query NCBI Entrez database, raw data in genbank format.

 * ncbitrace
   Query NCBI Trace archive, raw data in fasta and traceinfo format.  

 * unigene
   Download from NCBI UniGene, raw data in fasta and unigenedata format

=head3 analysis.description

The description contains the source-specific query that will be
run. E.g. an entrez or trace archive query. Ignored in the case of unigene

=head3 last_run

The last_run field contains the date on which the analysis was last
run. This is used to set date constraints on queries so that the
database can be incrementally updated. No constraints will be applied
if last_run is NULL.

=head3 Example MarkersDB.analysis entry;

  analysis_id: 24
analysis_name: cloneend_maize
         type: ncbitrace
  description: species_code='ZEA MAYS' and trace_type_code='CLONEEND'
     last_run: 2005-07-26

=head3 NOTE

Some analyses create correspondences to others. E.g. estcluster
analyses link to their component EST/mRNA markers. In these cases
users are advised to update the component (i.e. est, mrna) anlyses
_before_ loading the clusters (i.e. estcluster).

=head2 --species

The spceies for a given marker will generally be taken from the
annotation of the source. For sources where this is not possible, then
the name of the species to use may be provided.

=head2 --datadir

The datadir is the directory into which the analysis-specific files
are downloaded. The default is /usr/local/data/<analysis_type>, e.g.
/usr/local/data/entrez. The name of the file(s) created will depend on
the source, but will generally be named after the date and the analysis. 

=head2 --reuse

If the reuse flag is set, then the source data already in datadir will
be used, rather than downloading a new set.

=head2 --no_insert

For debug, there is a --no_insert flag that causes the program to go
through the motions, but ommits any methods that cause the database to
be updated.

=head2 --logfile, --logdir

Everything printed to STDERR by the program will be logged to a
file. Lines start with '[INFO]', '[WARN]' or '[*DIE]' for ease of
grepping. The logfile and logdir can be specified as program
arguments. You can also specify '--logfile=STDOUT' to get the log to
print to the terminal (useful for debug).

=head2 Program Flow

The program itterates through each sequence returned by the trace 
query and uses the data to;

  - Construct a variable representing a MarkersDB-compliant marker,
  - Update the MarkersDB if the marker already exists,
  - Insert the Marker into the DB if it is 'new',
  - Create a correspondence between the marker and markers of the 
    same species that share a synonym,
  - Create a correspondence between the marker and its read pair,

=head2 Oops

If it all goes wrong. Here is some SQL that blows away all traces of
an analysis and its markers. Run in order due to key constraints;

  #Clean up correspondences

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.from_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="bacend_maize";
 
  delete ac
  from  marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and   m.marker_id=ac.from_marker_id
  and   a.analysis_name="bacend_maize";

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.to_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="bacend_maize";
 
  delete ac
  from marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and m.marker_id=ac.to_marker_id
  and a.analysis_name="bacend_maize";

  # And the marker
  update marker m, analysis a 
  set    m.display_synonym_id=1
  where  m.analysis_id=a.analysis_id 
  and    a.analysis_name="bacend_maize";

  delete m 
  from  marker m, analysis a 
  where m.analysis_id=a.analysis_id 
  and   a.analysis_name="bacend_maize";

  # Reset the analysis to re-run

  update analysis 
  set last_run=NULL 
  where analysis_name="bacend_maize";


=head1 SEE ALSO

Gramene::Marker::DB, Gramene::CDBI::Markers

=head1 AUTHOR

Will Spooner E<lt>whs@ebi.ac.ukE<gt>.

=cut
# ----------------------------------------------------

# Ensure that code is using perl lib in same project
use FindBin qw($Bin);
use File::Basename qw( dirname );
use Digest::MD5 qw(md5_hex);

BEGIN {
    my $project_root = dirname( dirname( $Bin ) );
    unshift( @INC, $project_root . '/lib/perl' );
}

use strict;
use Bio::DB::GenBank;
use Bio::DB::Query::GenBank;
use Bio::FeatureIO;
use Bio::Root::IO;
use Bio::Seq::RichSeq;
use Bio::SeqFeature::Generic;
use Bio::SeqIO;
use Carp;
use Data::Dumper qw(Dumper);
use Date::Calc;
use File::Copy;
use File::Path;
use File::Spec::Functions;
use Getopt::Long;
use Gramene::Marker::DB;
use Gramene::Ontology::OntologyDB;
use HTTP::Request::Common;
use LWP::UserAgent;
use Net::FTP;
use Pod::Usage;
use Readonly;
use Text::RecordParser;

# Default network/filesystem locations
Readonly my $UNIGENE_URL  => 'ftp://ftp.ncbi.nih.gov/repository/UniGene';
Readonly my $TIGR_GI_URL  => 'ftp://ftp.tigr.org//pub/data/tgi';
Readonly my $DEFAULT_CONF_PATH 
    => '/usr/local/gramene/conf/gramene.conf';
Readonly my $TRACE_URL    
    => 'http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=raw';
Readonly my $PGDB_TUG_URL 
    => 'Currently unavailable '.
    '- http://www.plantgdb.org/download/download.php?dir=/Sequence/ESTcontig#';

our ( $no_processed, $count, $no_errors ) = ( 0, '???', 0 );
our ( %all_attribs );
END { &print_report; exit(0) }
$SIG{'INT'} = sub { exit(0) };

my $reuse   = 0;
my $in_file = '';
my ( $help, $v, $conffile, $analysis_name, $species, $man_page,
     $datadir, $no_insert, $logdir, $logfile );
GetOptions(
    'analysis:s'    => \$analysis_name,
    'config_file:s' => \$conffile,
    'datadir:s'     => \$datadir,
    'f|file:s'      => \$in_file,
    'help'          => \$help,
    'logdir:s'      => \$logdir,
    'logfile:s'     => \$logfile,
    'man'           => \$man_page,
    'no_insert'     => \$no_insert,
    'reuse_data'    => \$reuse,
    'species:s'     => \$species,
    'verbose'       => \$v,
);

if ( $help || $man_page ) {
    pod2usage({
        -exitval => 0,
        -verbose => $man_page ? 2 : 1
    });
}; 

#----
# Validate params
my $I = $no_insert ? 0 : 1;
unless ( $I ) {
    warn( "[INFO] Evaluation run - no db changes will be made\n" );
}

unless ( $analysis_name ) {
    warn( "\n[*DIE] Must specify a --analysis\n\n" );
    pod2usage;
}

if ( defined $conffile ) { 
    $ENV{'GrameneConfPath'} = $conffile 
}
$ENV{'GrameneConfPath'} ||= $DEFAULT_CONF_PATH;

for my $file ( $ENV{'GrameneConfPath'} ) {
    unless ( $file ) {
        warn( "\n[*DIE] Unable to find config file\n\n" );
        pod2usage;
    }
    unless ( -e $file ) {
        warn( "\n[*DIE] File $file does not exist\n\n" );
        pod2usage;
    }
    unless ( -r $file ) {
        warn( "\n[*DIE] Cannot read $file\n\n" );
        pod2usage;
    }
    unless ( -f $file ) {
        warn( "\n[*DIE] File $file is not plain-text\n\n" );
        pod2usage;
    }
    unless ( -s $file ) {
        warn( "\n[*DIE] File $file is empty\n\n" );
        pod2usage;
    }
}

my $rundate = sprintf( '%4.4i%2.2i%2.2i', Date::Calc::Today );

#----
# Log stderr to logfile. TODO: Change this so that logs to STDERR by default
$logdir ||= $ENV{PWD};
if ( !$logfile ) {
    my ( $file ) = ( $0 =~ m/([^\/]+)$/ );
    $file =~ s/\.\w+$//;

    #my $date = sprintf('%4.4i%2.2i%2.2i',Date::Calc::Today);
    $logfile
        = join( ".", $rundate, $$, $file, ( $analysis_name || () ), 'log' );
}

if ( uc( $logfile ) eq 'STDERR' or uc( $logfile ) eq 'STDOUT' ) {
    # Log to STDOUT
    open( LOG, ">&$logfile" ) or die( $! );
}
else {
    if ( !-d $logdir ) {
        mkpath $logdir;
    }

    $logfile = catfile( $logdir, $logfile );

    # Log to file
    open( LOG, ">$logfile" )
        or die( "Cannot open $logdir/$logfile for write: $!" );
    open( STDERR, ">&LOG" ) or die( $! );
}

#-----
# Connect to Marker and Ontology DB
$ENV{GATEWAY_INTERFACE}++;    # tell DBD::mysql to auto-reconnect
our ( $MDB, $ODB );
$MDB = Gramene::Marker::DB->new
    || die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";
$ODB = Gramene::Ontology::OntologyDB->new
    || die "\n[*DIE] Cannot get Gramene::Ontology::OntologyDB\n\n";

#unless( $I ){
#  # No insert - set MDB to read-only
#  $MDB = Gramene::Marker::DB->new( admin => 0 );
#  prlog( "[INFO] no_insert set. Database will not be updated" );
#}

#----
# Get analysis data from DB and validate
our %CDBI_ANALYSES
    = ( map { uc( $_->analysis_name ) => $_ } $MDB->retrieve_all_Analysis );

my $analysis = $CDBI_ANALYSES{ uc( $analysis_name ) } 
    or die "Invalid analysis '$analysis_name'";

our %CDBI_CORR_TYPES = ( map { uc( $_->type ) => $_ }
        $MDB->retrieve_all_AnalyticalCorrespondenceType );

unless (
    $analysis
    and (  $analysis->type eq 'ncbitrace'
        or $analysis->type eq 'entrez'
        or $analysis->type eq 'unigene'
        or $analysis->type eq 'fasta' 
        or $analysis->type eq 'genbank' )
    )
{
    warn sprintf(
        "\n[*DIE] No analysis $analysis_name in DB. Use eg:"
            . "\n       --analysis=%s\n\n",
        join(
            "\n       --analysis=",
            map { $_->analysis_name }
                grep {
                       $_->type eq 'ncbitrace'
                    or $_->type eq 'entrez'
                    or $_->type eq 'fasta'
                } values %CDBI_ANALYSES
        )
    );
    pod2usage();
}

# Validate some analyses that are used later
my @a = qw( SHARED_SYNONYM MATE_PAIR CLONE_END EST_CLUSTER );
foreach my $a ( @a ) {
    $CDBI_CORR_TYPES{$a} || die( "\n[*DIE] No $a corr type in DB" );
}

my $marker_type_name;
if (   $analysis_name =~ /bac\s*end/
    or $analysis_name =~ /clone\s*end/ )
{
    $marker_type_name = 'bac_end_sequence';
}
elsif ($analysis_name =~ /^est_/
    or $analysis_name =~ /^dbest_/ )
{
    $marker_type_name = 'est';
}
elsif ( $analysis_name =~ /^estcluster_/ ) {
    $marker_type_name = 'est_cluster';
}
elsif ( $analysis_name =~ /^clone_/ ) {
    $marker_type_name = 'clone';
}
elsif ( $analysis_name =~ /^wgs_/ ) {
    $marker_type_name = 'wgs';
}
elsif ( $analysis_name =~ /^gss_/ ) {
    $marker_type_name = 'gss';
}
elsif ( $analysis_name =~ /^mrna/ ) {
    $marker_type_name = 'mrna';
}

#if ( !$marker_type_name ) {
#    warn(
#        "\n[*DIE] Cannot determine the MarkerType from $analysis_name\n\n" );
#    pod2usage;
#}

#---
# Validate marker_type
my %CDBI_MARKER_TYPES;
for my $m ( $MDB->retrieve_all_MarkerType ) {
    my $name = lc( $m->marker_type );
    $name =~ s/\s+/_/g;
    $CDBI_MARKER_TYPES{$name} = $m;
}

my $marker_type_obj = $CDBI_MARKER_TYPES{$marker_type_name};
#    || die( "\n[*DIE] No marker_type $marker_type_name\n\n" );

#---
# Validate species if provided
our $CDBI_SPECIES;
if ( $species ) {
    ( $CDBI_SPECIES ) = $MDB->search_Species( 'species' => $species );
    $CDBI_SPECIES || die( "\n[*DIE] No species; $species\n\n" );
}

#---
# Seq attributes to skip
my %skip_fields_by_analysis;
$skip_fields_by_analysis{ncbitrace} = {
    accession          => 1,    # Used for synonym
    center_name        => 1,    # Used for Library
    center_project     => 1,    # Used for Library
    chemistry_type     => 1,    # Uninteresting
    clip_quality_left  => 1,    # Used to clip sequence
    clip_quality_right => 1,    # " Ditto
    clip_vector_left   => 1,    # " Ditto
    clip_vector_right  => 1,    # " Ditto
    mate_pair          => 1,    # Used for correspondence
    plate_id           => 1,    # Synonymous with plate
    program_id         => 1,    # E.g. PHRED, uninteresting
    seq_lib_id         => 1,    # Used for Library
    source_type        => 1,    # Always GENOMIC
    species_code       => 1,    # Used for species_id FK
    submission_type    => 1,    # E.g. NEW. Enable?
    taxid              => 1,    # Used for species_id FK
    template_id        => 1,    # Synonym with template
    ti                 => 1,    # Used for synonym
    trace_end          => 1,    # Synonymous with read_direction
    trace_format       => 1,    # E.g. ABI. Uninteresting
    trace_name         => 1,    # Used for marker_name
    trace_type_code    => 1,    # Always CLONEEND
    well_id            => 1,    # Synonymous with row/col
};
$skip_fields_by_analysis{entrez} = {
    mol_type => 1,              # Always DNA
    cultivar => 1,              # Used for Germplasm
    strain   => 1,              # Used for Germplasm
};
$skip_fields_by_analysis{unigene} = {};

my %skip_fields_by_type;
$skip_fields_by_type{bac_end_sequence} = {
    clone        => 1,          # Used for clone correspondence
    clone_lib    => 1,          # Used for library
    comment      => 1,          # Discard. TODO: add to schema?
    date_changed => 1,          # Discard. TODO: add to schema?
    date_created => 1,          # Discard. TODO: add to schema?
    date_updated => 1,          # Discard. TODO: add to schema?
    db_xref      => 1,          # TODO: evaluate this tag
    keyword      => 1,          # Discard. TODO: add to schema?
    lab_host     => 1,          # Discard. TODO: add to schema?
    note         => 1,          # Discard. TODO: add to schema?
    organism     => 1,          # Used for Species
    ref_authors  => 1,          # Discard. TODO: add to schema?
    ref_location => 1,          # Discard. TODO: add to schema?
    ref_title    => 1,          # Discard. TODO: add to schema?
    ref_year     => 1,          # Discard. TODO: add to schema?
    trace        => 1,          # Used for synonym
};

#-----
# Get the SeqIO stream
my ( $seq_count, $seqio ) = &get_seqio( 
    analysis => $analysis, 
    data_dir => $datadir, 
    reuse    => $reuse,
    file     => $in_file,
);

#---
# Loop through each seq
my $batch_size = 1000;          # For progress output

while ( my $seq = $seqio->next_seq ) {
    my $batch = $no_processed / $batch_size;
    if ( $no_processed and $batch - int( $batch ) eq 0 ) {
        print( "...processed $no_processed of $seq_count...\n" );
    }
    prlog( "[INFO][MARKER] Accession: " . $seq->accession_number . "\n" )
        if $v;

    eval { 
        process_seq_to_marker( 
            seq         => $seq, 
            marker_type => $marker_type_obj, 
            analysis    => $analysis 
        ); 
    };

    if ( $@ ) {
        prlog(    "[ERROR]  process_seq_to_marker failed for "
                . $seq->accession_number . "|"
                . $seq->display_id
                . ", $@" );
        $no_errors++;
    }

    $no_processed++;

    #last if $no_processed > 1;
}

# All done - update the last_run field of analysis
if ( $I and $analysis ) {
    # warn Dumper( $analysis );
    # $analysis->last_run( sprintf( "%4.4i-%2.2i-%2.2i", Date::Calc::Today ) );
    $analysis->last_run( $rundate );
    $analysis->update;
    prlog(    "[INFO][ANALYSIS] "
            . "Updating "
            . $analysis->analysis_name
            . ".last_run to "
            . $analysis->last_run );
}

exit 0;

#======================================================================
#
sub get_seqio {
    my %args     = @_;
    my $analysis = $args{'analysis'};
    my $datadir  = $args{'data_dir'};
    my $reuse    = $args{'reuse'};
    my $in_file  = $args{'file'};

    unless ( UNIVERSAL::isa( $analysis, 'Gramene::CDBI::Markers::Analysis' ) )
    {
        die( "Need a Gramene::CDBI::Markers::Analysis object" );
    }

    my $analysis_type = $analysis->type;
    my $analysis_name = $analysis->analysis_name;
    $analysis_type    =~ s/\s+/_/g;
    $analysis_name    =~ s/\s+/_/g;
    my $date          = sprintf( '%4.4i%2.2i%2.2i', Date::Calc::Today );
    $datadir        ||= "/usr/local/data/$analysis_type";

    if ( !-d $datadir ) {
        warn( "\n[*DIE] datadir $datadir is not a directory\n\n" );
        pod2usage;
    }

    my $file_template = "$datadir/$date.$analysis_name.%s";

    if ( $analysis_type eq 'entrez' ) {
        return ( _get_seqio_entrez( $analysis, $file_template, $reuse ) );
    }
    elsif ( $analysis_type eq 'ncbitrace' ) {
        return ( _get_seqio_ncbitrace( $analysis, $file_template, $reuse ) );
    }
    elsif ( $analysis_type eq 'unigene' ) {
        return ( _get_seqio_unigene( $analysis, $file_template, $reuse ) );
    }
    elsif ( $analysis_type eq 'fasta' ) {
        return ( _get_seqio_fasta( $analysis, $file_template, $reuse ) );
    }
    elsif ( $analysis_type eq 'genbank' ) {
        return ( _get_seqio_genbank( $in_file ) );
    }
    else {
        die( "Do not know how to create a SeqIO stream for $analysis_type" );
    }
}

sub _get_seqio_genbank {
    my $file_name = shift or die "No GenBank filename\n";

    unless ( -e $file_name && -s _ ) {
        die "Missing or empty GenBank file.\n"
    }

    my $seqio = Bio::SeqIO->new(
        -format => 'genbank',
        -file   => $file_name,
    );

    my $count = '1';

    return ( $count, $seqio );
}

sub _get_seqio_fasta {
    my $analysis = shift || die( "Need an analysis" );
    my $file_t   = shift || die( "Need a filename template" );
    my $reuse    = shift;

    my $file = sprintf( $file_t, 'fasta' );
    my $location = $analysis->description;

    my $count = '???';

    if ( $reuse ) {    # Reuse existing files on disc
        prlog( "[INFO][SEQIO] reusing file $file" );
        unless ( -e $file ) {
            prlog(
                "\n[*DIE] --reuse specified but file $file does not exist\n\n"
            );
            pod2usage;
        }
    }
    else {             # Get file from somewhere
        if ( $location =~ m|^file://(.+)| ) {

            # Get file from the filesystem
            my $ofile = $1;
            File::Copy::copy( $ofile, $file )
                or
                ( prlog( "\n[*DIE] Copy from $ofile to $file failed: $!\n\n" )
                && pod2usage );
        }
        elsif ( $location =~ m|ftp://(.+?)/(.+)| ) {
            my $ftp_host    = $1;
            my $ftp_file    = $2;
            my $compression = '';
            if ( $ftp_file =~ /(\.gz)$/i ) {

                # Handle compression
                $file .= lc( $1 );
                $compression = lc( $1 );
            }
            prlog( "[INFO][SEQIO] Retrieving $location to $file" );
            my $ftp = Net::FTP->new( $ftp_host )
                || die( "[*DIE] Cannot connect to $ftp_host: $@" );
            $ftp->login( "anonymous", '-anonymous@' )
                || die( "[*DIE] Cannot login ", $ftp->message );
            $ftp->binary;
            $ftp->get( $ftp_file, $file )
                || die( "[*DIE] Cannot get $ftp_file ", $ftp->message );

            if ( $compression eq '.gz' ) {    # Handle compression if needed
                prlog( "[INFO][SEQIO] Uncompressing $file with gzip" );
                system( 'gzip', '-d', $file ) == 0
                    or die( "Could not gzip -d: $?" );
                $file =~ s/\.gz$//i;
            }
        }
        else {
            prlog( "\n[*DIE] Do not understand location $location.\n\n" );
            pod2usage;
        }
    }

    # Test that we have the correct number of entries in the file
    prlog( "[INFO][SEQIO] Counting entries in $file ..." );
    $count = 1;    #`grep -c '^>' $file`;
    chomp $count;
    $count || die( "[*DIE][SEQIO] No sequences found for $location" );
    prlog( "[INFO][SEQIO] Retrieved $count seqs" );

    # Create IO stream
    my $seqio = Bio::SeqIO->new(
        -format => 'fasta',
        -file   => $file
    );
    return ( $count, $seqio );
}

sub _get_seqio_entrez {
    my $analysis = shift || die( "Need an analysis" );
    my $file_t   = shift || die( "Need a filename template" );
    my $reuse    = shift;

    my $file = sprintf( $file_t, 'gb' );

    my $count = '???';

    if ( $reuse ) {    # Reuse existing files on disc
        prlog( "[INFO][SEQIO] reusing file $file" );
        unless ( -e $file ) {
            prlog(
                "\n[*DIE] --reuse specified but file $file does not exist\n\n"
            );
            pod2usage;
        }
    }
    else {             # Direct GenBank query
        prlog( "[INFO][SEQIO] writing to file $file" );

        my %args = (
            -db    => 'nucleotide',
            -query => $analysis->{description}
        );
        if ( my $mindate = $analysis->{last_run} ) {
            $mindate =~ s/-/\//g;
            $args{-mindate} = $mindate;
            $args{-maxdate} = join( "/", Date::Calc::Today() );
        }
        my $query = Bio::DB::Query::GenBank->new( %args );
        $count = $query->count;

        unless ( $count > 0 ) {
            prlog(    "\n[*DIE][ENTREZ] query returned no results "
                    . "($args{-query}"
                    . ( $args{-mindate} ? " from $args{-mindate}" : '' )
                    . ")\n\n" );
            pod2usage;
        }

        prlog( "[INFO][SEQIO] Expecting $count seqs ($args{-query})" );

        open( GB, "> $file" )
            or die( "Could not open $file for write: $!" );

        my $error;
        my $processing = sub {
            if ( $_[0] =~ /^\nError/ ) { $error .= $_[0]; $error =~ s/\n//g; }
            else { print GB $_[0] }
        };
        my $gb = Bio::DB::GenBank->new();
        my $req = $gb->get_request( -query => $query, -mode => 'query' );

        my $res = LWP::UserAgent->new->request( $req, $processing );
        die "Couldn't connect to entrez server" if !$res->is_success;
        if ( $error ) { die( "\n[*DIE][ENTREZ] $error\n\n" ) }
        close( GB );

    }

    # Test that we have the correct number of entries in the file
    prlog( "[INFO][SEQIO] Counting entries in $file ..." );
    my $ret_count = `grep -c '^//' $file`;
    chomp $ret_count;
    if ( $count eq '???' ) { $count = $ret_count }  # Reusing - assume correct
    if ( $ret_count ne $count ) {
        die(
            "\n[*DIE][ENTREZ] Expecting $count entries, got $ret_count\n\n" );
    }
    prlog( "[INFO][SEQIO] Retrieved $ret_count seqs of $count" );

    # Create IO stream
    my $seqio = Bio::SeqIO->new(
        -format => 'genbank',
        -file   => $file
    );
    return ( $count, $seqio );
}

sub _get_seqio_ncbitrace {
    my $analysis = shift || die( "Need an analysis" );
    my $file_t   = shift || die( "Need a filename template" );

    my $fastafile = sprintf( $file_t, 'fasta' );
    my $infofile  = sprintf( $file_t, 'info' );

    my $page_size = 40000;

    my $count = "???";

    if ( $reuse ) {    # Reuse existing files on disc
        prlog( "[INFO][SEQIO] reusing files $fastafile, $infofile" );
        foreach my $f ( $fastafile, $infofile ) {
            unless ( -e $f ) {
                warn(
                    "\n[*DIE] --reuse specified but file $f does not exist\n\n"
                );
                pod2usage;
            }
        }
    }
    else {             # Direct Trace query
        my $filter = $analysis->description;
        if ( my $mindate = $analysis->last_run ) {
            $filter .= " and load_date>'$mindate'";
        }
        my $count_query = "query count $filter";
        my $processing  = sub {
            if ( $_[0] =~ /(\d+)/ ) { $count = $1 }
        };
        _do_trace_query( $count_query, $processing );

        # Retrieve TRACE data for each page (max 40000 per page)
        my $pages = $count / $page_size;
        if ( $pages - int( $pages ) > 0 ) { $pages = int( $pages ) + 1 }
        prlog( "[INFO][SEQIO] Expecting $count seqs ($filter)" );
        prlog(
            "[INFO][SEQIO] This needs $pages requests ($page_size per request)"
        );

        open( FASTA, "> $fastafile" )
            or die( "Could not open $fastafile for write" );
        open( INFO, "> $infofile" )
            or die( "Could not open $infofile for write" );

        for ( my $i = 0; $i < $pages; $i++ ) {
            prlog(
                "[INFO][SEQIO] ...request " . ( $i + 1 ) . " of $pages..." );
            my $ti_bin = '';
            my $query
                = "query page_size $page_size page_number $i binary $filter";
            my $processing = sub { $ti_bin .= $_[0] };
            _do_trace_query( $query, $processing );

            if ( $ti_bin ) {
                my $query_fasta      = "retrieve fasta 0b$ti_bin";
                my $query_info       = "retrieve info 0b$ti_bin";
                my $processing_fasta = sub { print FASTA $_[0] };
                my $processing_info  = sub { print INFO $_[0] };
                _do_trace_query( $query_fasta, $processing_fasta );
                _do_trace_query( $query_info,  $processing_info );
            }
        }
        close( FASTA );
        close( INFO );
    }

    # Create IO streams
    my $infoio = Bio::FeatureIO->new(
        -format => 'traceinfo',
        -file   => $infofile
    );
    my $fastaio = Bio::SeqIO->new(
        -format => 'fasta',
        -file   => $fastafile
    );
    my $seqio = Bio::SeqIO->new(
        -format    => 'ncbitrace',
        -traceinfo => $infoio,
        -fasta     => $fastaio
    );
    die();
    return ( $count, $seqio );
}

sub _do_trace_query {
    my $query      = shift;    # Query string
    my $processing = shift;    # Processing callback
    my $req = HTTP::Request::Common::POST( $TRACE_URL, [ query => $query ] );
    my $res = LWP::UserAgent->new->request( $req, $processing );
    die "Couldn't connect to TRACE server" if !$res->is_success;
    return $res;
}

sub _get_seqio_unigene {
    my $analysis = shift || die( "Need an analysis" );
    my $file_t   = shift || die( "Need a filename template" );

    my ( $datadir ) = $file_t =~ m|(.+)\/[^/]+|;
    my $datafile  = sprintf( $file_t, 'data' );
    my $fastafile = sprintf( $file_t, 'fasta' );

    my %ignore_species
        = ( 'Arabidopsis_thaliana' => 1 );    # Don't process these!
    my $count = '???';

    if ( $reuse ) {                           # Reuse existing files on disc
        prlog( "[INFO][SEQIO] reusing files $fastafile, $datafile" );
        foreach my $f ( $fastafile, $datafile ) {
            unless ( -e $f ) {
                warn(
                    "\n[*DIE] --reuse specified but file $f does not exist\n\n"
                );
                pod2usage;
            }
        }
    }
    else {    # Download unigene files from NCBI

        # Initialise files/counts
        unlink( $datafile, $fastafile );
        $count = 0;

        # Initialise the FTP connection
        my ( $ftp_host, $ftp_dir ) = $UNIGENE_URL =~ m|ftp://(.+?)(/.+)|;

        $ftp_host || die( "Cannot parse FTP host from $UNIGENE_URL" );

        my $ftp = Net::FTP->new( $ftp_host )
            || die( "[*DIE] Cannot connect to $ftp_host: $@" );
        $ftp->login( "anonymous", '-anonymous@' )
            || die( "[*DIE] Cannot login ", $ftp->message );
        $ftp->cwd( $ftp_dir )
            || die( "[*DIE] Cannot change working directory ",
            $ftp->message );

        # Loop through each UniGene species
        foreach my $sp ( $ftp->ls ) {
            next if $ignore_species{$sp};
            my $sp_str = $sp;
            $sp_str =~ s/_/ /g;

            # If UniGene species matches MarkerDB, then consider processing
            if ( my $cdbi_sp = $MDB->search_Species( species => $sp_str ) ) {

               # Parse the date stamp and version from the <species>.info file
                my ( $info_file ) = $ftp->ls( "$sp/*.info" );
                my ( $prefix ) = $info_file =~ m/(\w+).info/;
                my $info_data;
                my $sh = IO::Scalar->new( \$info_data );
                $ftp->ascii;    # Need to play nice with Scalar::IO
                $ftp->get( $info_file, $sh );

                # Get UniGene version + count
                my ( $ugene_version )
                    = $info_data =~ m/(UniGene Build \#\d+)/;
                my ( $sp_count ) = $info_data =~ m/(\d+)\s+sets total/;
                $ugene_version
                    || prlog( "[WARN] UniGene version for $sp not found" );
                $sp_count
                    || prlog( "[WARN] UniGene count for $sp not found" );

                # Get UniGene release date
                my ( $ugene_date, $file_date );
                if ( $info_data =~ m/dbEST through (\d+) (\w+) (\d+)/ ) {
                    my ( $d, $m, $y ) = ( $1, $2, $3 );
                    $m = Date::Calc::Decode_Month( $m );
                    $ugene_date = sprintf( "%4.4d%2.2d%2.2d", $y, $m, $d );

                    #$file_date  = sprintf( "%4.4d%2.2d%2.2d", $y,$m,$d );
                }
                else {

                    # Date not found
                    &prlog( "[WARN] UniGene date for $sp not found" ) && next;
                }

                my $last_run_date = $analysis->{last_run} || 0;
                $last_run_date =~ s/\D//g;
                if ( $ugene_date > $last_run_date ) {

                    # UniGene updates exist. FTP the latest unigene file
                    $count += $sp_count;
                    prlog(    "[INFO] FTPing $sp $ugene_version $ugene_date "
                            . "($sp_count clusters)" );
                    my $spdatafile  = "$prefix.data.gz";
                    my $spfastafile = "$prefix.seq.uniq.gz";
                    $ftp->binary;
                    $ftp->get( "$sp/$spdatafile",  "$datadir/$spdatafile" );
                    $ftp->get( "$sp/$spfastafile", "$datadir/$spfastafile" );
                    system( "gunzip -c $datadir/$spdatafile  >> $datafile" );
                    system( "gunzip -c $datadir/$spfastafile >> $fastafile" );
                    unlink( "$datadir/$spdatafile", "$datadir/$spfastafile" );
                }
            }
        }
    }

    my $ret_count_fasta = `grep -c '^>' $fastafile`;
    if ( $ret_count_fasta ne $count ) {
        die(
            "\n[*DIE][UNIGENE] Expecting $count seqs, got $ret_count_fasta\n\n"
        );
    }
    my $ret_count_data = `grep -c '^ID ' $datafile`;
    if ( $ret_count_data ne $count ) {
        die(
            "\n[*DIE][UNIGENE] Expecting $count data, got $ret_count_data\n\n"
        );
    }

    die( "Not yet implemented" );
}

#----------------------------------------------------------------------
# This subroutine is responsible for turning a Bio::Seq object
# into markers and correspondences in the MarkersDB.
#
sub process_seq_to_marker {
    my %args             = @_;
    my $seq              = $args{'seq'}      || die( "Need a Bio::Seq object" );
    my $cdbi_analysis    = $args{'analysis'} || die( "Need a CDBI::Analysis" );
    my $cdbi_marker_type = $args{'marker_type'};

    if ( !$cdbi_marker_type ) {
        my %attr = annotation_to_hash( $seq->annotation );
        my $marker_type =
            $attr{'keyword'} =~ /^(GSS|EST)$/i
                ? uc $1
                : $attr{'mol_type'} =~ /mrna/i
                    ? 'mRNA'
                    : 'other_nucleotide'
        ;

        $cdbi_marker_type = $CDBI_MARKER_TYPES{ $marker_type }
            || die "\n[*DIE] Cannot determine marker type\n";
    }

    # Initialise the marker annotation variables, and populate the easy ones
    # The completed %marker (except fields starting with '_') will be
    # used as the args to Gramene::Marker::DB->create_Marker
    my %marker = (
        MarkerType  => $cdbi_marker_type,
        Analysis    => $cdbi_analysis,
        description => $seq->description,
        synonyms    => [],
        details     => {},
        _seq        => $seq,
        _attribs    => {},
    );

    # Fetch the attributes (annotation)
    $marker{'_attribs'} = &get_attribs( {%marker} );
    my %attribs = %{ $marker{'_attribs'} };


    # Set the CDBI::Species
    $marker{Species} = &get_Species( {%marker} );

    # Set the CDBI::Germplasm (if available)
    if ( my $g = &get_Germplasm( {%marker} ) ) { $marker{Germplasm} = $g }

    # Set the CDBI::Library
    if ( my $l = &get_Library( {%marker} ) ) { $marker{Library} = $l }

    # Fetch the synonyms data. Each is a hashref containing the name and type.
    my @synonyms = @{ &get_synonyms( {%marker} ) };

    # Take the first marker for the display_marker_synonym, 
    # and the rest as syns
    $marker{marker_name} = $synonyms[0];    # display_marker_synonym
    $marker{synonyms}    = [@synonyms];     # other synonyms

    # Fetch the correspondence data
    my @correspondences = @{ &get_correspondences( {%marker} ) };
    my $syn = shift @correspondences;

    # Search for an existing marker that has the same name/species as this
    if ( $syn ) {
        my $old_id     = $syn->[1];
        my $old_marker = $MDB->retrieve_Marker( $old_id );

        # if( my $old_marker = &search_existing_marker( {%marker} ) ){
        # Found an existing marker. Update this rather than create new.
        # ...This is indicated by setting $marker{marker_id}.
        # We will either replace all annotation in the existing marker with new,
        # ...or accept old marker annotation; fill in the gaps only.
        my $old_analysis_type    = $old_marker->analysis->type;
        my $new_analysis_type    = $marker{Analysis}->type;
        my $use_existing_attribs = 0;
        if ( $old_analysis_type eq $new_analysis_type ) {
            # Same analysis. Which one is more recent?
            if ( $new_analysis_type eq 'entrez' ) {
                # Test version
                my ( $old_version ) = (
                    sort { $b <=> $a } map {m/\.(\d{1,2})$/}
                        map { $_->marker_name } $old_marker->marker_synonyms
                );
                my ( $new_version ) = (
                    sort { $b <=> $a } map {m/\.(\d{1,2})$/}
                        map { $_->{marker_name} } @{ $marker{synonyms} }
                );
                if ( $new_version <= $old_version ) { # Old version up-to-date
                    $use_existing_attribs = 1;
                }
            }
        }

        if (    $new_analysis_type eq 'ncbitrace'
            and $old_analysis_type eq 'entrez' )
        {

            # entrez takes priority over ncbitrace
            $use_existing_attribs = 1;
        }

        $marker{marker_id}             = $old_marker->id;
        $marker{_use_existing_attribs} = $use_existing_attribs;
        $marker{_existing_marker}      = $old_marker;
    }

    #---
    # We have gathered all the annotation we need.

    # These are the annotations to skip
    my $marker_type_name = lc( $marker{MarkerType}->marker_type );
    $marker_type_name =~ s/\s/_/g;    # Strip whitespace
    my %skip_fields = (
        %{ $skip_fields_by_analysis{ $analysis->type } || {} },
        %{ $skip_fields_by_type{$marker_type_name}     || {} }
    );

    # These are the annotations to keep
    my ( $table, @fields )
        = $MDB->marker_type_to_table_name( $marker_type_name );
    my %detail_fields = ( map { $_ => 1 } @fields );
    $detail_fields{seq} = 1;          # explicitly add seq field

    # Copy marker_detail annotation from %_attribs
    foreach my $key ( keys %detail_fields ) {
        if ( defined( $marker{_attribs}->{$key} ) ) {
            $marker{details}->{$key} = $marker{_attribs}->{$key};
        }
    }

    #---
    # Add/update the marker to the DB

    $marker{date_updated} = $rundate;
    unless ( $marker{marker_id} ) {

        # Insert marker into DB
        &prlog(
            sprintf(
                "[INFO][MARKER] Creating %s (%s, %s)",
                $marker{marker_name}->{marker_name},
                $marker{MarkerType}->marker_type,
                $marker{Species}->species
            )
        );
        if ( $I ) {

            # warn Dumper(\%marker);

            $marker{date_created} = $rundate;

            eval { $marker{marker_id} = $MDB->create_marker( %marker ); };
            if ( $@ ) {
                die(      "[ERROR]  cannot create marker "
                        . $marker{marker_name}->{marker_name}
                        . ", $@" );
            }
        }
        else { $marker{marker_id} = 'XX' }
    }
    else {
        # Update marker
        # Only fill in the blank fields of the old marker, never replace 
        # an old value

        &prlog(
            sprintf(
                "[INFO][MARKER] Updating %s (%s, %s)",
                $marker{marker_name}->{marker_name},
                $marker{MarkerType}->marker_type,
                $marker{Species}->species
            )
        );

        if ( $marker{_use_existing_attribs} ) {
            my $old_marker = $marker{_existing_marker};

            delete( $marker{marker_name} );
            delete( $marker{MarkerType} );
            delete( $marker{Species} );
            $old_marker->germplasm_id != 1 and delete( $marker{Germplasm} );
            $old_marker->analysis_id != 1  and delete( $marker{Analysis} );
            $old_marker->library_id != 1   and delete( $marker{Library} );
            $old_marker->description       and delete( $marker{description} );
            my %old_details
                = %{ $MDB->get_marker_details( Marker => $old_marker ) };
            $old_details{seq}
                = $MDB->get_marker_sequence( marker_id => $old_marker );

            #print "seq = $old_details{seq}\n";
            foreach my $a ( keys %old_details ) {

                if ( $a eq 'seq' ) {

                    # instead of compare seq directly, we could
                    # follow example in /home/kclark/work/markers/set-md5.pl
                    # to calculate md5_sum for the new seq and
                    # compare with the old md5_sum

                    my $old_seq = lc( $old_details{seq} );
                    my $new_seq = lc( $marker{details}->{seq} );

                    # if the same delete the seq from %marker
                    # if different, get rid of all the gt0506 sequence
                    # map mappings for this marker_id
                    if ( $old_seq eq $new_seq ) {
                        delete( $marker{details}->{$a} );
                    }
                    else {
                        prlog(
                            "Delete TIGR mappings for marker_id "
                            . " $marker{marker_id} due to sequence updates"
                        );

                        remove_tigr_mappings_for_updated_seq(
                            $marker{marker_id} )
                            if ( $I );
                    }

                }
                else {
                    unless ( defined( $old_details{$a} ) ) {next}
                    delete( $marker{details}->{$a} );
                }
            }
        }

        if ( $I ) {
            $MDB->update_marker( %marker );
        }
    }

    # And the correspondences
    # Needs an array of arrayrefs, where each arrayref contains;
    # [0] the cdbi_analytical_correspondence_type,
    # [1] the from_marker_id, or undef to used the current marker_id
    # [2] the to_marker_id,   or undef to used the current marker_id
    # EITHER/OR [1] or [2] must be populated. Not neither, not both.
    for my $cdata ( @correspondences ) {
        my ( $cdbi_corrtype, $from_marker_id, $to_marker_id ) = @{$cdata};

        unless ( $from_marker_id || $to_marker_id ) {
            die( 'Need a correspondence id' );
        }

        $from_marker_id ||= $marker{'marker_id'};
        $to_marker_id   ||= $marker{'marker_id'};

        if ( $from_marker_id == $to_marker_id ) { next }

        &prlog(   "[INFO][CORR] Creating ("
                . $cdbi_corrtype->type
                . ") from ID $from_marker_id to $to_marker_id" );

        if ( $I ) {
            # print"set_correspondence $from_marker_id, $to_marker_id, 
            # " . ", " . $cdbi_corrtype->id ."\n";
            $MDB->set_correspondence(
                from_marker_id => $from_marker_id,
                to_marker_id   => $to_marker_id,
                analysis_id    => $cdbi_analysis->analysis_id,
                analytical_correspondence_type_id => $cdbi_corrtype->id,
            ) || die( $MDB->error );
        }
    }

    for my $key ( sort keys %attribs ) {
        next if $skip_fields{ $key };
        next if $detail_fields{ $key };

        my $l = length( $attribs{ $key } );
        $all_attribs{$key} = $l if ( $l > $all_attribs{$key} );
        print sprintf( "  %-16.16s = %-100.100s\n", $key, $attribs{$key} )
            if $v;
    }

    return ( {%marker} );
}

#----------------------------------------------------------------------
#
sub get_attribs {
    my $marker        = shift;
    my $seq           = $marker->{_seq}       || croak( "No Bio::Seq object" );
    my $cdbi_analysis = $marker->{Analysis}   || croak( "No CDBI Analysis" );
    my $analysis_type = lc( $cdbi_analysis->type );
    my $cdbi_mt       = $marker->{MarkerType} || croak( "No CDBI MarkerType" );
    my $marker_type   = lc( $cdbi_mt->marker_type );

    # Process sequence annotation fields
    my %attribs = ( &annotation_to_hash( $seq->annotation ) );
    for my $sf ( $seq->get_all_SeqFeatures ) {
        %attribs = ( %attribs, &seqfeature_to_hash( $sf ) );
    }

    # Extra attribs from Bio::Seq species
    if ( my $species_obj = $seq->species ) {
        if ( my $v = $species_obj->sub_species ) {
            $attribs{sub_species} = $v;
        }
        if ( my $v = $species_obj->organelle ) { $attribs{organelle} = $v }
    }

    # Process dates
    my @dates;
    if ( $seq->can( 'get_dates' ) ) {
        foreach my $date ( $seq->get_dates ) {
            use Date::Calc qw(Decode_Month);
            my ( $day, $mo, $yr ) = split( '-', $date );
            $mo = Decode_Month( $mo );
            push @dates, sprintf( "%4.4d-%2.2d-%2.2d", $yr, $mo, $day );
        }
        $attribs{date_created} = $dates[0];
        $attribs{date_updated} = $dates[-1];

        #warn( "==> $details{date_created} $details{date_updated}" );
    }

    # Process sequence
    my $seq_str = $seq->seq;
    if ( $analysis_type eq 'ncbitrace' ) {

        # Need to quality-clip the sequence
        my $clip_l = $attribs{clip_quality_left}
            || $attribs{clip_vector_left}
            || 1;
        my $clip_r = $attribs{clip_quality_right}
            || $attribs{clip_vector_right}
            || $seq->length;
        $seq_str = $seq->subseq( $clip_l, $clip_r );

    }
    $attribs{seq_length} = $seq->length;
    $attribs{seq}        = $seq_str;

    # Analysis/Type specific attribute processing
    if ( $analysis_type eq 'ncbitrace' ) {
        if ( my $v = $attribs{trace_end} )   { $attribs{read_direction} = $v }
        if ( my $v = $attribs{plate_id} )    { $attribs{plate}          = $v }
        if ( my $v = $attribs{well_id} )     { $attribs{row}            = $v }
        if ( my $v = $attribs{template_id} ) { $attribs{template}       = $v }
    }

    if ( $marker_type eq 'gss' ) {

        # If unset, try to retrieve some of the annotations from the
        # clone
        my $clone_name = $attribs{clone} || $seq->display_id;
        if ( $clone_name =~ /(.+?)([a-z]{0,1})(\d{4}[A-Z]{1}\d{2})(.*)/ ) {

            # Clone-name in the format e.g. OSJNBa0001A01
            $attribs{clone} = $1 . $2 . $3;    # Validated clone name
            $attribs{clone_lib}
                ||= $1 . $2;    # Use prefix as clone lib if needed
            $attribs{template} ||= $1 . $2 . $3;    # Copy clone to template

            # Some hacks for Oryza sativa.
            # CUGI library nbeb renamed to OSJNBb, and nbxb to OSJNBa.
            # Also AGI suffix for readdir is '.f'/'.r', CUGI is 'f'/'r'.
            if ( $1 eq 'OSJNB' ) {
                unless ( $marker->{_seq}->isa( 'Bio::Seq::RichSeq' ) ) {

                    # We need a RichSeq to enable adding of 2'd accessions!
                    bless( $marker->{_seq}, 'Bio::Seq::RichSeq' );
                }
                my $oldlib = $2 eq 'a' ? 'nbxb' : 'nbeb';
                if ( $4 eq 'f' or $4 eq 'r' ) {     # CUGI format
                    $marker->{_seq}->add_secondary_accession(
                        [ $oldlib . $3 . $4, $1 . $2 . $3 . '.' . $4 ] );
                }
            }

        }
    }

    if ( $marker_type eq 'clone' ) {

        # Can we infer the library name from the marker name?
        # Mainly for direct FASTA load
        my $marker_name = $marker->{_seq}->accession_number;
        if ( $marker_name =~ /(.+?)([a-z]{0,1})(\d{4}[A-Z]{1}\d{2})/ ) {

            # E.g. for OSJNBa0001A01.f, library is OSJNBa
            $marker->{_seq}->display_id( $1 . $2 . $3 );
            $marker->{_seq}
                ->add_secondary_accession( [ $marker_name, $2 . $3 ] );
            $attribs{clone_lib} ||= $1 . $2;

            # Hacks for Oryza sativa (see above)
            if ( $1 eq 'OSJNB' ) {
                my $oldlib = $2 eq 'a' ? 'nbxb' : 'nbeb';
                $marker->{_seq}->add_secondary_accession( [ $oldlib . $3 ] );
            }
        }
    }

    return {%attribs};
}

#----------------------------------------------------------------------
#
sub annotation_to_hash {
    my $ac = shift;
    $ac->isa( 'Bio::AnnotationCollectionI' )
        || die( "Need an AnnotationCollection" );

    my %annots;
    foreach my $key ( $ac->get_all_annotation_keys() ) {
        foreach my $annot ( $ac->get_Annotations( $key ) ) {
            if ( $annot->isa( 'Bio::Annotation::SimpleValue' ) ) {
                $annots{$key} = $annot->value;
            }
            elsif ( $annot->isa( 'Bio::Annotation::Comment' ) ) {
                $annots{$key} = $annot->text;
            }
            elsif ( $annot->isa( 'Bio::Annotation::Reference' ) ) {
                $annots{'ref_authors'}  = $annot->authors;
                $annots{'ref_title'}    = $annot->title;
                $annots{'ref_location'} = $annot->location;
                $annots{'ref_pubmed'}   = $annot->pubmed;
                $annots{'ref_medline'}  = $annot->medline;
                if ( $annots{'ref_location'} =~ /\((\d{4,4})\)/ ) {
                    $annots{'ref_year'} = $1;
                }
            }
            else {
                $annots{$key} = $annot;
            }
        }
    }
    return %annots;
}

#----------------------------------------------------------------------
#
sub seqfeature_to_hash {
    my $sf = shift;
    $sf->isa( "Bio::SeqFeatureI" )
        || die( "Need a SeqFeature" );

    my %annots;
    foreach my $tag ( $sf->get_all_tags ) {
        $annots{$tag} = join( " || ", $sf->get_tag_values( $tag ) );
    }
    return %annots;
}

#----------------------------------------------------------------------
#
our %CDBI_SYNONYM_TYPES;

sub get_synonyms {
    my $marker = shift           || croak( "Need a marker hashref" );
    my $seq    = $marker->{_seq} || croak( "No Bio::Seq object" );
    my $cdbi_analysis = $marker->{Analysis} || croak( "No CDBI Analysis" );
    my $attrib_hash   = $marker->{_attribs} || croak( "No _attrib hash" );
    my $analysis_type = $cdbi_analysis->type;
    my $marker_type   = $marker->{MarkerType}->marker_type;
    my %attribs       = %{$attrib_hash};

    # Pre-cache a list of synonym types
    unless ( keys %CDBI_SYNONYM_TYPES ) {
        %CDBI_SYNONYM_TYPES
            = map { $_->synonym_type => $_ } $MDB->retrieve_all_SynonymType;
    }

    # Select CDBI analysis-specific types
    my $acc          = $seq->accession_number;
    my $gb_acc_regex = $CDBI_SYNONYM_TYPES{GENBANK_ACCESSION}->validation;
    my %syn_types;

    # the gb_acc_regex defined in db was  ^([A-Z]\d{5})|([A-Z]{2}_{0,1}\d{6})$
    # does not match accessions like NM_001073711, so change it
    #$gb_acc_regex = '^([A-Z]\d{5})|([A-Z]{2}_\d+)$';
    #print "The acc is $acc, $analysis_type, $gb_acc_regex\n";

    if ( $analysis_type eq 'entrez' && $acc =~ /$gb_acc_regex/i ) {

        #print "The acc is $acc\n";
        $syn_types{accession} = $CDBI_SYNONYM_TYPES{GENBANK_ACCESSION};
        $syn_types{version}   = $CDBI_SYNONYM_TYPES{GENBANK_VERSION};
        $syn_types{primary}   = $CDBI_SYNONYM_TYPES{GENBANK_GI};
    }
    elsif ( $analysis_type eq 'ncbitrace' ) {
        $syn_types{accession} = $CDBI_SYNONYM_TYPES{TRACE_NAME};
        $syn_types{primary}   = $CDBI_SYNONYM_TYPES{TRACE_TI};
    }
    else {
        $syn_types{accession} = $CDBI_SYNONYM_TYPES{UNKNOWN};
        $syn_types{primary}   = $CDBI_SYNONYM_TYPES{UNKNOWN};
    }

    my @synonyms;

    if ( my $syn = $acc and $acc ne 'unknown' ) {

        # Used for the marker_display_id
        push @synonyms,
            {
            marker_name => $syn,
            SynonymType => $syn_types{'accession'}
            };
    }

    if ( my $syn = $seq->primary_id || $attribs{'ti'} ) {

        # NCBI ID (GI, TI)
        unless ( $syn =~ /^Bio\:\:/ )
        {    # A primary_seq is returned if no primary_id
            push(
                @synonyms,
                {
                    marker_name => $syn,
                    SynonymType => $syn_types{'primary'}
                }
            );
        }
    }
    if ( my $syn = $attribs{'accession'} ) {

        # Genbank accession - to top of list.
        unshift(
            @synonyms,
            {
                marker_name => $syn,
                SynonymType => $CDBI_SYNONYM_TYPES{'GENBANK_ACCESSION'}
            }
        );
    }
    if ( $seq->can( 'seq_version' ) ) {

        # Versioned accession
        for ( my $i = 1; $i <= $seq->seq_version; $i++ ) {
            my $syn = "$acc.$i";
            push(
                @synonyms,
                {
                    marker_name => $syn,
                    SynonymType => $syn_types{'version'}
                }
            );
        }
    }
    if ( $seq->can( 'get_secondary_accessions' ) ) {

        #Secondary accession
        my @synary = $seq->get_secondary_accessions();

        #warn Dumper(\@synary);
        foreach my $syn ( @synary ) {
            push(
                @synonyms,
                {
                    marker_name => $syn,
                    SynonymType => $CDBI_SYNONYM_TYPES{'UNKNOWN'}
                }
            );
        }
    }
    if ( my $syn = $attribs{'trace'} ) {

        #Poss trace_name
        push(
            @synonyms,
            {
                marker_name => $syn,
                SynonymType => $CDBI_SYNONYM_TYPES{'UNKNOWN'}
            }
        );
    }

    return [@synonyms];
}

#----------------------------------------------------------------------
# Looks for markers with the same synonyms and similar species
# as the new marker.
# Returns an arrayref of arrayrefs.
# The first element of the outer array is the 'synonymous' marker, if
# found, else undef.
# The inner arrays consist of
# [ $cdbi_synonym_type, $from_marker_id, $to_mareker_id]
sub get_correspondences {
    my $marker = shift || croak( "Need a marker hashref" );
    my @correspondences;
    my %corr_marker_ids;

    my @syns = ( map { $_->{marker_name} } @{ $marker->{synonyms} } );
    my %syn_types
        = ( map { $_->{marker_name} => $_->{SynonymType}->synonym_type }
            @{ $marker->{synonyms} } );
    my $marker_type = uc( $marker->{MarkerType}->marker_type );

    # Synonym types to never synonymise. E.g. integer database IDs
    my %ignore_syntype = ( TRACE_TI => 1, GENBANK_TI => 1 );

    #---
    # BACend/GSS specific correspondences
    if ( $marker_type =~ /^(BAC)|(GSS)/ ) {

        #---
        # Explicit mate_pair via trace_id
        if ( my $mate_ti = $marker->{_attribs}{mate_pair} ) {
            foreach my $mrk (
                $MDB->search_marker_synonyms(
                    synonyms    => [$mate_ti],
                    Species     => $marker->{Species},
                    MarkerType  => $marker->{MarkerType},
                    SynonymType => $CDBI_SYNONYM_TYPES{'TRACE_TI'}
                )
                )
            {
                my $mate_marker
                    = $MDB->retrieve_Marker( $mrk->{'marker_id'} );
                $corr_marker_ids{ $mrk->{'marker_id'} } && next;
                $corr_marker_ids{ $mrk->{'marker_id'} } = $mrk;
                push(
                    @correspondences,
                    [
                        $CDBI_CORR_TYPES{'MATE_PAIR'}, $mate_marker->id, undef
                    ]
                );
            }
        }

        #---
        # Implicit mate_pair via template (same as clone, but indexed so fast)
        my $search = 'search_MarkerDetailsGss';
        if ( my $template = $marker->{_attribs}{template} ) {
            foreach my $md ( $MDB->$search( { template => $template } ) ) {
                my $mid = $md->id;
                $corr_marker_ids{$mid} && next;
                $corr_marker_ids{$mid} = {};
                push( @correspondences,
                    [ $CDBI_CORR_TYPES{'MATE_PAIR'}, $mid, undef ] );
            }
        }

        #---
        # clone correspondence
        # create new clone if not found in database
        if ( my $clone = $marker->{_attribs}{clone} ) {
            my $cdbi_clone;
            foreach my $mrk (
                $MDB->search_marker_synonyms(
                    synonyms   => [$clone],
                    MarkerType => $CDBI_MARKER_TYPES{'clone'},
                    Species    => $marker->{Species}
                )
                )
            {
                if ( $ignore_syntype{ uc( $mrk->{synonym_type} ) } ) {next}
                $cdbi_clone = $MDB->retrieve_Marker( $mrk->{marker_id} );
            }
            unless ( $cdbi_clone ) {

                # Create a new clone marker - copy sequence and clone_lib
                my $clone_seq
                    = Bio::Seq::RichSeq->new( -accession_number => $clone );

                $clone_seq->species( $marker->{_seq}->species );
                if ( my $lib = $marker->{_attribs}{clone_lib} ) {
                    my $feat = Bio::SeqFeature::Generic->new(
                        -tag => { clone_lib => $lib } );
                    $clone_seq->add_SeqFeature( $feat );
                }
                $cdbi_clone = process_seq_to_marker( 
                    seq         => $clone_seq,
                    analysis    => $marker->{Analysis},
                    marker_type => $CDBI_MARKER_TYPES{'clone'},
                );
                $cdbi_clone || die( "Could not create clone for $clone" );
            }
            my $mid = $cdbi_clone->{marker_id};
            $corr_marker_ids{$mid} && next;
            $corr_marker_ids{$mid} = {};
            push( @correspondences,
                [ $CDBI_CORR_TYPES{CLONE_END}, undef, $mid ] );
        }
    }

    #---
    # Shared synony correspondences
    my $found_synonym = 0;
    foreach my $mrk (
        $MDB->search_marker_synonyms(
            synonyms => [@syns],
            Species  => $marker->{Species}
        )
        )
    {
        $corr_marker_ids{ $mrk->{marker_type_id} } && next;

        # Look for synonymous marker
        if (
            uc( $mrk->{marker_type} ) eq $marker_type
            and ( uc( $mrk->{synonym_type} ) eq 'UNKNOWN'
                or $mrk->{synonym_type} eq $syn_types{ $mrk->{marker_name} } )
            )
        {
            $found_synonym++;
            unshift( @correspondences, [ 'SYNONYMOUS', $mrk->{marker_id} ] );
        }

        # Assume shared synonym
        else {
            if ( $ignore_syntype{ uc( $mrk->{synonym_type} ) } ) {next}
            push(
                @correspondences,
                [
                    $CDBI_CORR_TYPES{SHARED_SYNONYM}, $mrk->{marker_id}, undef
                ]
            );

        }
        $corr_marker_ids{ $mrk->{marker_type_id} } = {};
    }
    $found_synonym || unshift( @correspondences, undef );

    return \@correspondences;
}

#----------------------------------------------------------------------
#
# cache a list of marker species objects keyed by ncbi_taxid.
our %markerdb_species;

sub get_Species {
    my $marker  = shift           || croak( "Need a marker hashref" );
    my $bio_seq = $marker->{_seq} || croak( "Need a Bio::Seq" );
    my $bio_species = $bio_seq->species;
    $bio_species || return $CDBI_SPECIES;    # Use default if needed
    my $species_name = $bio_species->binomial;

    my $ncbi_taxid = $bio_species->ncbi_taxid
        || die( "Species obj must have an ncbi_taxid" );

    if ( my $sp = $markerdb_species{$ncbi_taxid} ) {

        # Found in cache
        return $sp;
    }

    # Still here? fetch gramene_tax_id from ontology DB
    my $term = $ODB->get_term_by_xref( 'NCBI_taxid', $ncbi_taxid );

    my $gramene_tax_id;
    if ( $gramene_tax_id = $term->{term_accession} ) {
        if (
            my ( $sp ) = $MDB->search_Species(
                { gramene_taxonomy_id => $gramene_tax_id }
            )
            )
        {

            # Found in DB
            $markerdb_species{$ncbi_taxid} = $sp;    # Update cache
            return $sp;
        }
        else {
            $species_name = $term->{term_name} if $term->{term_name};
        }
    }
    else {
        my $spname = $bio_species->binomial;
        prlog(
            "[WARN] Species $species_name (NCBI $ncbi_taxid) has no GR:TaxID"
        );
        if ( my ( $sp )
            = $MDB->search_Species( { species => $species_name } ) )
        {

            # Found in DB
            $markerdb_species{$ncbi_taxid} = $sp;    # Update cache
            return $sp;
        }
    }

    # Still here? Create new
    my %sp_data = ( species => $species_name );
    if ( $gramene_tax_id ) { $sp_data{gramene_taxonomy_id} = $gramene_tax_id }
    if ( my $v = $bio_species->common_name ) { $sp_data{common_name} = $v }
    prlog( "[INFO][SPECIES] Creating $sp_data{species} ($gramene_tax_id)" );

    my $species_obj;
    if ( $I ) {
        $species_obj = $MDB->insert_Species( {%sp_data} );
    }

    # Update cache
    $markerdb_species{$ncbi_taxid} = $species_obj;

    return $species_obj;
}

#----------------------------------------------------------------------
#
sub get_Germplasm {
    my $marker  = shift               || croak( "Need a marker hashref" );
    my $attribs = $marker->{_attribs} || croak( "No _attribs hash" );
    my $species = $marker->{Species}  || croak( "No Species" );

    # Process germplasm
    my %seen_germplasm = ();
    my $germplasm;
    foreach my $type ( 'strain', 'cultivar' ) {

    # Order of $type denotes reverse priority; cultivar overwrites sub_species
        if ( my $val = $attribs->{$type} ) {
            $germplasm = $val;
            $seen_germplasm{$type} = $val;
        }
    }
    if ( keys( %seen_germplasm ) > 1 ) {  # Multiple attribs satisfy germplasm
            # TODO handle cleanly if needed, or uncomment for warning
            #foreach my $key( keys %seen_germplasm ) {
         #  print sprintf( "  %-16.16s = %-100.100s\n",$key,$seen_germplasm{$key} );
         #}
    }
    if ( $germplasm ) {
        my $germplasm_id = &markerdb_germplasm_id( $germplasm, $species->id );
        return $MDB->retrieve_Germplasm( $germplasm_id );
    }
    return undef;
}

#----------------------------------------------------------------------
#
# Cache a list of all marker libraries keyed by library name.
our %marker_libraries;

sub get_Library {
    my $marker  = shift;
    my %attribs = %{ $marker->{_attribs} };

    # Return unless library attrib is valid for this marker type
    my $marker_type = uc( $marker->{MarkerType}->marker_type );
    unless ( $marker_type =~ /^BAC/i
        or $marker_type eq 'EST'
        or $marker_type eq 'CLONE'
        or $marker_type eq 'GSS'
        or $marker_type eq 'MRNA' )
    {
        return ();
    }

    my $library_name = $attribs{clone_lib}
        || $attribs{seq_lib_id}
        || $attribs{ref_location};

    $library_name || return;

    if ( length( $library_name ) > 255 ) {
        $library_name = substr( $library_name, 0, 255 );    # Trim for MySQL
    }

    if ( exists( $marker_libraries{$library_name} ) ) {

        # Cached
        return $marker_libraries{$library_name};
    }

    my %search = ( library_name => $library_name );
    my %fields = ();

    if ( my $v = $attribs{center_name} )    { $fields{center_name}    = $v }
    if ( my $v = $attribs{center_project} ) { $fields{center_project} = $v }
    if ( my $v = $attribs{development_stage} ) {
        $fields{development_stage} = $v;
    }
    if ( my $v = $attribs{tissue_type} ) { $fields{tissue_type} = $v }
    if ( my $v = $attribs{cell_type} )   { $fields{cell_type}   = $v }
    if ( my $v = $attribs{cell_line} )   { $fields{cell_line}   = $v }
    if ( my $v = $attribs{sex} )         { $fields{sex}         = $v }

    my ( $library ) = $MDB->search_Library( {%search} );
    unless ( $library ) {
        prlog( "[INFO][LIBRARY] Creating $search{library_name}" );
        if ( $I ) {
            $library = $MDB->insert_Library( { %search, %fields } );
        }
    }

    # TODO: do we want to update the library with the new fields?
    $marker_libraries{$library_name} = $library;
    return $library || undef;
}

#----------------------------------------------------------------------
#
# Cache a list of all marker germplasms keyed by germplasm name.
our %all_marker_germs;

sub markerdb_germplasm_id {

    unless ( %all_marker_germs ) {

        # Initialise cache
        foreach ( @{ $MDB->get_all_germplasm } ) {
            my $sid = $_->{species_id};
            my $gid = $_->{germplasm_id};
            my $nm  = $_->{germplasm_name};
            $all_marker_germs{$sid} ||= {};
            $all_marker_germs{$sid}->{$nm} = $gid;
        }
    }

    my $germplasm = shift || die( "Need a germplasm name" );
    my $truncated = 0;
    if ( length( $germplasm ) > 50 ) {

        # Truncate name to fit database field
        $germplasm = substr( $germplasm, 0, 47 ) . "...";
        $truncated++;
    }
    my $species_id = shift || die( "Need a species_id" );
    my $germplasm_id = '???';
    if ( my $id = $all_marker_germs{$species_id}->{$germplasm} ) {

        # Found in cache
        $germplasm_id = $id;
    }
    else {

        # Create new
        prlog(
            "[INFO][GERMPLASM] Creating $germplasm (species_id $species_id)"
        );
        if ( $truncated ) {
            prlog(
                "[WARN][GERMPLASM] Truncated germplasm_name to $germplasm" );
        }
        if ( $I ) {
            $germplasm_id
                = $MDB->find_or_create_germplasm( $germplasm, $species_id )
                || die( $MDB->error );
        }

        # Update cache
        $all_marker_germs{$species_id}->{$germplasm} = $germplasm_id;
    }
    return $germplasm_id;
}

#----------------------------------------------------------------------
#
sub log_correspondence_problem {
    my $type    = shift || die( "Need a problem type" );
    my $marker1 = shift;
    my $marker2 = shift;

    $type = uc( $type );
    if ( $type eq 'SPECIES' ) {
        my $t = ( "[WARN][CORR] X-species shared syn: "
                . "Marker %s (%s, %s) vs. "
                . "Marker %s (%s, %s)" );
        prlog(
            sprintf( $t,
                \$marker1->{marker_name}, $marker1->{marker_type},
                $marker1->{species},      $marker2->{marker_name},
                $marker2->{marker_type},  $marker2->{species},
            )
        );
    }
    else {
        die( "Unknown correspondence problem, $type" );
    }
}

#----------------------------------------------------------------------
#
sub prlog {
    my $message = shift;
    print LOG $message . "\n";
}

#----------------------------------------------------------------------
#
sub print_report {
    # Look for unhandled attributes
    foreach my $attrib ( sort keys %all_attribs ) {
        prlog(
            "[WARN] Unhandled attrib: $attrib. Length: $all_attribs{$attrib}"
        );
    }

    # Print status info
    my $msg = sprintf(
        "[INFO] Done.\nLogged to: %s\n"
        . "Processed %s record%s of %s with %s error%s.",
        $logfile,
        $no_processed, ( $no_processed == 1 ) ? '' : 's',
        $count, $no_errors, ( $no_errors == 1 ) ? '' : 's',
    );
    prlog( $msg );
    print( "$msg\n" );

}

#-----------------------------------------------------------------------
# for an existing marker whose seq changes and needs update, we also need
# to remove
# all its mappings to tigr assembly so that the mapping will be redone

sub remove_tigr_mappings_for_updated_seq {
    my $marker_id = shift || return;
    my $mapping_ids = $MDB->search_tigr_mappings_for_markerid( $marker_id );

    for my $mapping_id ( @{$mapping_ids} ) {
        $MDB->delete_mapping( mapping_id => $mapping_id );
    }

    return;
}
#======================================================================


