#!/usr/local/bin/perl

=head1 NAME

load-markers-genbank.pl - load marker data into db direct from GenBank
DEPRECATED: Use load-markers-by-analysis.pl instead.

=head1 SYNOPSIS

  load-markers.pl [options]

Options:

  -h|--help         Show brief help and exit.
  -v|--verbose      Talk about what's happening.
  -c|--config_file  Path to DB config file, def $ENV{GrameneConfPath}.
  -a|--analysis     analysis_name in MarkersDB analysis table.
  -m|--marker_type  Marker type (e.g. EST).
  -n|--no_insert    Do not make changes to the database. Useful for debug.
  --datafile        Path to source GenBank file (if debugging this way).
  --logdir          Directory to write logs into. Def $PWD
  --logfile         File to write logs into. Def date.pid.script.analysis.log

=head1 DESCRIPTION

Loads data directly into the markers DB from GenBank using an Entrez
query. Under development.

The program retrieves the requested 'analysis' from the MarkersDB
(connection defined by config_file) and uses the query defined therein
to pull entries from GenBank and load them into the MarkersDB.

Format of config_file;

  <markers_admin>
      db_dsn   dbi:mysql:{dbname}:{myhost}:{myport}
      db_user  marker_rw_user
      db_pass  secret
  </markers_admin>

If the program is called with e.g. '-analysis dbest_poaceae', then, so
long as the type is 'entrez', GenBank will be queried with the text in
the description, limited between the last_run date and today's
date. This allows for incremental updates. The last_run date will be
updated upon completion. If last_run is NULL, then no limits will be
applied.

An example MarkersDB.analysis entry;

  analysis_id: 6
analysis_name: dbest_poaceae
         type: entrez
  description: Poaceae[ORGN] AND gbdiv_est[PROP]
     last_run: 2005-07-26

The program itterates through each sequence returned by the entrez
query and uses the data to;

  - Construct a variable representing a MarkersDB-compliant marker,
  - Update the MarkersDB if the marker already exists,
  - Insert the Marker into the DB if it is 'new',
  - Create a correspondenc between the marker and markers of the 
    same species that share a synonym.

For debug, there is a --no_insert flag that causes the program to go
through the motions, but ommits any methods that cause the database to
be updated. It is also possible to supply the path to a sequence file
in GenBank format; the --no_update flag is implied.

Everything printed to STDERR by the program will be logged to a
file. Lines start with '[INFO]', '[WARN]' or '[*DIE]' for ease of
grepping. The logfile and logdir can be specified as program
arguments. You can also specify '--logfile=STDOUT' to get the log to
print to the terminal (useful for debug).

If it all goes wrong. Here is some SQL that blows away all traces of
an analysis and its markers. Run in order due to key constraints;

  #Clean up correspondences

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.from_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="dbest_poaceae_2003";
 
  delete ac
  from  marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and   m.marker_id=ac.from_marker_id
  and   a.analysis_name="dbest_poaceae_2003";

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.to_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="dbest_poaceae_2003";
 
  delete ac
  from marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and m.marker_id=ac.to_marker_id
  and a.analysis_name="dbest_poaceae_2003";

  # Clean up synonyms
  
  update marker m, analysis a 
  set    m.display_synonym_id=NULL 
  where  m.analysis_id=a.analysis_id 
  and    a.analysis_name="dbest_poaceae_2003";

  delete ms 
  from  marker m, marker_synonym ms, analysis a 
  where m.marker_id=ms.marker_id 
  and   m.analysis_id=a.analysis_id 
  and   a.analysis_name="dbest_poaceae_2003";

  # And the marker

  delete md
  from marker m, analysis a, marker_details_est md
  where m.analysis_id=a.analysis_id
  and   m.marker_id=md.marker_id 
  and   a.analysis_name="dbest_poaceae_2003";

  delete m 
  from  marker m, analysis a 
  where m.analysis_id=a.analysis_id 
  and   a.analysis_name="dbest_poaceae_2003";

  # Reset the analysis to re-run

  update analysis 
  set last_run=NULL 
  where analysis_name="dbest_poaceae_2003";


=head1 SEE ALSO

Gramene::Marker::DB, Text::RecordParser.

=head1 AUTHOR

Will Spooner E<lt>whs@ebi.ac.ukE<gt>.

=cut

# ----------------------------------------------------

BEGIN{
  die( "[*DIE] DEPRECATED: Use load-markers-by-analysis.pl instead\n\n" );
}

use strict;
use Getopt::Long;

use Gramene::Ontology::OntologyDB;
use Pod::Usage;
use Text::RecordParser;

use Bio::DB::Query::GenBank;
use Bio::DB::GenBank;
use Data::Dumper qw(Dumper);
use Date::Calc;
use Carp;

use lib "/home/weix/gramene/lib/perl";
use Gramene::Marker::DB;

our ( $no_processed, $count, $no_errors ) = ( 0, '???', 0 );
our ( %all_attribs );
END{ &print_report; exit(0) }
$SIG{'INT'} = sub { exit(0) };

my ( $help, $v, $conffile, $gbfile, $analysis_name, $marker_type,
     $no_insert, $logdir, $logfile );
GetOptions(
           'help'           => \$help,
           'config_file:s'  => \$conffile,
           'datafile:s'     => \$gbfile,
           'verbose'        => \$v,
           'analysis:s'     => \$analysis_name,
           'marker_type:s'         => \$marker_type,
           'no_insert'      => \$no_insert,
           'logdir:s'       => \$logdir,
           'logfile:s'      => \$logfile,
);
pod2usage(-verbose => 2) if $help;

#----
# Validate params
if( defined $conffile ){ $ENV{GrameneConfPath} = $conffile }

unless( $gbfile || $analysis_name ){
  warn( "\n[*DIE] Must specify either a --analysis or --datafile\n\n" );
  pod2usage;
}
unless( $marker_type ){
  warn( "\n[*DIE] Must specify a --marker_type, e.g. est\n\n" );
  pod2usage;
}

my $I = $no_insert ? 0 : 1;
unless( $I ){ warn( "[INFO] Evaluation run - no db changes will be made\n")}

foreach my $file( $ENV{GrameneConfPath}, ( $gbfile ? $gbfile : () ) ){
  unless( $file ){
    warn( "\n[*DIE] Unable to find config file\n\n" );
    pod2usage;
  } unless( -e $file ){
    warn( "\n[*DIE] File $file does not exist\n\n" );
    pod2usage;
  } unless( -r $file ){
    warn( "\n[*DIE] Cannot read $file\n\n" );
    pod2usage;
  } unless( -f $file ){
    warn( "\n[*DIE] File $file is not plain-text\n\n" );
    pod2usage;
  } unless( -s $file ){
    warn( "\n[*DIE] File $file is empty\n\n" );
    pod2usage;
  }
}

#----
# Log stderr to logfile
$logdir ||= $ENV{PWD};
unless( $logfile ){
  my( $file ) = ( $0 =~ m/([^\/]+)$/ );
  $file =~ s/\.\w+$//;
  my $date = sprintf('%4.4i%2.2i%2.2i',Date::Calc::Today);
  $logfile = join( ".", $date, $$, $file, ($analysis_name||()),'log' );
}
if( uc($logfile) eq 'STDERR' or uc($logfile) eq 'STDOUT' ){
  # Log to STDOUT
  open( LOG, ">&$logfile" ) or die( $! );
} else {
  # Log to file
  open( LOG, ">$logfile" ) or die( $! );
  open( STDERR, ">&LOG" ) or die( $! );
}

#-----
# Connect to Marker and Ontology DB
our( $MDB, $ODB );
$ENV{GATEWAY_INTERFACE} ++; # tell DBD::mysql to auto-reconnect

$MDB = Gramene::Marker::DB->new( admin=>1 ) ||
    die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";
$ODB = Gramene::Ontology::OntologyDB->new ||
    die "\n[*DIE] " . Gramene::Ontology::OntologyDB->error . "\n\n";

#----
# Get analysis data from DB
my $analysis;
if( $analysis_name ){
  $analysis = $MDB->view_analysis(analysis_name=>$analysis_name) ||
    die "\n[*DIE] " . $MDB->error . "\n\n";

#  print "analysis name is $analysis->{analysis_name}\n";
  unless( lc( $analysis->{type} ) eq 'entrez' ){
    die "\n[*DIE] Analysis type must be 'entrez', not $analysis->{type}\n\n";
  }
}
else{
  prlog( "[WARN] No analysis. no_insert set to true.");
  $I = 0;
}

my $corr_analysis = $MDB->view_analysis(analysis_name=>'SHARED_SYNONYM') ||
    die "\n[*DIE] " . $MDB->error . "\n\n";
my $corr_analysis_id = $corr_analysis->{analysis_id};

unless( $I ){
  # No insert - set MDB to read-only
  $MDB = Gramene::Marker::DB->new ||
      die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";
  prlog( "[INFO] no_insert set. Database will not be updated" );
}

#-----
# Get the detail table for this type
# print "The marker types are\n" . (join "\n", @marker_type_names) ."\n";

my $marker_types      = $MDB->get_all_marker_types( ); #order_by =>"marker_type is default
my @marker_type_names = reverse sort map { lc($_->{marker_type}) } @{$marker_types};
my $marker_type_re =  join "|", @marker_type_names;

#print "[DG] The marker type regex is $marker_type_re\n";

if( lc($marker_type) =~ / ($marker_type_re) /xms){
  #print "matched $1\n";
}else{
  my $msg = "[*DIE] --marker_type $marker_type unknown. Use est, mrna or bacend";
  prlog( $msg );
  #print( "\n$msg\n\n" );
  exit;
}

my ($table, @fields) = $MDB->marker_type_to_table_name($marker_type);
print "[DG] The detail table to use is $table\n";

#-----
# Get the SeqIO stream

my( $seqio, $count) = &get_seqio( $analysis, $gbfile );

#print "[DG] After get Seqio for $analysis->{analysis_name}, $gbfile\n";

#---
# GenBank attributes to skip
my %skip_fields = ( date_changed        => 1, # Date processing
                    db_xref             => 1, # Species processing
                    #keyword             => 1, # Always EST
                    #mol_type            => 1, # Always mRNA
                    organism            => 1, # Species processing
                    secondary_accession => 1, # Synonym processing
                    );

#---
# detail table attributes to keep

my %detail_fields = ( map{$_=>1} @fields);
print "[DG] The detail_fields keys are " . (join " | " , keys %detail_fields) ."\n" if $v;

#---
# Loop through each seq
my $batch_size = 1000; # For progress output
#my  $c = 0;
ENTRY: while( my $seq =  $seqio->next_seq ){
  my $batch = $no_processed/$batch_size;
  if( $batch - int($batch) eq 0 ){
    print( "...processed $no_processed of $count...\n" );
  }
  $no_processed ++;

  # warn Dumper( $seq );

  my $acc = $seq->accession;
  print( "\n\n$acc\n" ) if $v;

  # Initialise the marker annotation variables, and populate the easy ones
  my %marker  = ( marker_name     => $seq->accession,
		  synonym_type    => 'GENBANK_ACCESSION',
                  marker_type     => $marker_type,
                  description     => $seq->description,
                  analysis_id     => $analysis->{analysis_id},
                  synonyms        => [],
                  details         => {} ); 

  # This for possible details table attributes that can be assigned now
  my %details_for_all = ( 
		 sequence_length     => $seq->length,
		 clone_insert_length => $seq->length,
                 seq                 => $seq->seq,
		 
		);

  my %attribs;

  # Process species
  my $species_obj = $seq->species;
  $marker{species_id} = &markerdb_species_id($species_obj);
  $marker{species}    = $species_obj->binomial;

  print "[DG] $marker{species_id}, $marker{species}\n" if $v;

  # Process synonyms
  push @{$marker{synonyms}}, {marker_name => $seq->primary_id, 
			      synonym_type => 'GENBANK_GI'}; # NCBI GI number
  for( my $i=1; $i<=$seq->seq_version; $i++ ){
    my $syn =  "$acc.$i";
    push( @{$marker{synonyms}}, {marker_name => $syn,
				 synonym_type => 'GENBANK_VERSION'}); # Versioned accession
  }
  for my $syn( $seq->get_secondary_accessions ){
    push( @{$marker{synonyms}}, {marker_name => $syn,
				synonym_type => 'UNKNOWN'} );
  }

  # Process dates
  my @dates;
  foreach my $date( $seq->get_dates ){
    use Date::Calc qw(Decode_Month);
    my( $day, $mo, $yr ) = split( '-', $date );
    $mo = Decode_Month($mo);
    push @dates, sprintf( "%4.4d-%2.2d-%2.2d",$yr,$mo,$day );
  }
  $details_for_all{date_created} = $dates[0];
  $details_for_all{date_updated} = $dates[-1];
  #warn( "==> $details{date_created} $details{date_updated}" );

  # Process other annotation fields
  %attribs = ( %attribs, &annotation_to_hash( $seq->annotation ) );
  foreach my $sf( $seq->get_all_SeqFeatures ){
    %attribs = ( %attribs, &seqfeature_to_hash( $sf ) );
  }

  # Process germplasm
  $attribs{sub_species}  ||= $species_obj->sub_species;
  $attribs{organelle}    = $species_obj->organelle;

  if($v){
    print "***The marker $acc attribs hash looks like***\n";
    for my $k(sort keys %attribs){
      my $v = $attribs{$k};
      my $l = length ($attribs{$k});
      print "$k ==> $v, length = $l\n";
    }
  }

  my %seen_germplasm = ();
  my $germplasm;
  foreach my $type( 'strain','cultivar' ) {
    # Order of $type denotes reverse priority; cultivar overwrites sub_species
    if( my $val = $attribs{$type} ){
      $germplasm = $val;
      $seen_germplasm{$type} = $val;
    }
    delete( $attribs{$type} );
  }
  if( keys(%seen_germplasm) > 1 ){ # Multiple attribs satisfy germplasm
    # TODO handle cleanly if needed, or uncomment for warning
    #foreach my $key( keys %seen_germplasm ) {
    #  print sprintf( "  %-16.16s = %-100.100s\n",$key,$seen_germplasm{$key} );
    #}
  }
  if( $germplasm ){
    $marker{germplasm_id} = &markerdb_germplasm_id( $germplasm,
                                                    $marker{species_id} );
  }


  my %details;
  # populate detail table fields from %attribs and %details_for_all;
  foreach my $key( keys %detail_fields ){
    if( exists( $attribs{$key} ) ){
      $details{$key} = $attribs{$key};
    }elsif(exists( $details_for_all{$key})){
      $details{$key} = $details_for_all{$key};
    }
    delete( $attribs{$key} );
  }

  # Add details to marker
  $marker{details} = {%details};

  # Is this marker already in the DB?
  # For now marker_synonym_search only accept bare synonyms instead of synonym objects as input arg
  #
  my @synonyms = ( $marker{marker_name}, (map {$_->{marker_name}} @{$marker{synonyms}}) );
  my @cmarker_ids = $MDB->marker_synonym_search( synonyms => [@synonyms] );
  
  my @correspondences;

  #print "xxx $details{date_created} $details{date_updated} marker_id => $id\n" if $v;
  foreach my $id( @cmarker_ids ){

    my $cmarker = $MDB->view_marker( marker_id => $id ); ##need study here ###
    
    #if($v){
     # for my $k(keys %{$cmarker}){
	#print "This already in db cmarker: $k ==> $cmarker->{$k}\n";
	#if(ref $cmarker->{$k} eq 'HASH'){
	 # for my $j(keys %{$cmarker->{$k}}){
	  #  print "\t$j ==> $cmarker->{$k}->{$j}\n";
	  #}
	#}#elsif(ref $cmarker->{$k} eq 'ARRAY'){
	  #print "\t" . (join "\t", @{$cmarker->{$k}} ) ."\n";
	#}
      #} 
    #}

    print "species comparison: $marker{species} ?? $cmarker->{species}\n" if $v;

    if( &compare_species( $marker{species}, $cmarker->{species} ) ){

      print "same species -> marker type comparison: $marker{marker_type} ?? $cmarker->{marker_type}\n" if $v;
      
      # Markers from same species
      if( uc($marker{marker_type}) eq uc($cmarker->{marker_type}) ){
        # Markers from same species and type

	print "same type -> date comparison: $marker{details}->{date_updated} ?? $cmarker->{details}->{date_updated}\n" if $v;

        if( $marker{details}->{date_updated} eq 
            $cmarker->{details}->{date_updated} ){
          # No change from marker already in DB!
          &prlog( "[INFO][MARKER] already up-to-date in DB: ".
                  "$marker{marker_name} ".
                  "($marker{marker_type}, $marker{species})" );
          next ENTRY;
        }
        # UPDATE MARKER!
        # TODO: think about handling curated data!
	# if there are more than one cmarkers need update, we want to collap them into one,
	# the Markers::DB function for doing that is under development
        
	$marker{marker_id} = $cmarker->{marker_id};
      
	# Combine synonyms into unique list
        my %syns = ( map{ $_->{marker_name} => $_ }
                    (@{$cmarker->{synonyms}}, @{$marker{synonyms}}) ); #????
        $marker{synonyms} = [values %syns];

      }

      else{
        # Markers from different type: CREATE CORRESPONDENCE;
        push( @correspondences, $cmarker->{marker_id});
      }

    } else {
      &log_correspondence_problem( 'species', {%marker}, $cmarker );
    }
  }

#debug

  if($v){
    print "\n***Marker hash is:***\n";
    foreach my $k(keys %marker){
      my $r = $marker{$k};
      print "$k => $r\n" unless($k eq 'seq');	
    }
    print "\tDETAIL HASH........\n";
    foreach my $k( keys %{$marker{details}} ){
      my $r = $marker{details}->{$k};
      print  "\t$k => $r\n" unless($k eq 'seq');
    }
  }
#print "\tsynonyms array........\n:". (join "\t", @{$marker{synonyms}}) ."\n";
#debug 

 #---
  # Add/update the marker to the DB
  my $marker_id = $marker{marker_id};
  unless( $marker_id ){
    # Insert marker into DB
    &prlog( "[INFO][MARKER] Creating $marker{marker_name} ".
            "($marker{marker_type}, $marker{species})" );
    if( $I ){

      $marker_id = $MDB->create_marker(%marker) ||
          croak( $MDB->error );

      #print "[DG] after create_marker\n";
    }
  } else {
    # Update marker
    &prlog( "[INFO][MARKER] Updating $marker{marker_name} ".
            "($marker{marker_type}, $marker{species})" );    
    if( $I ){
      $MDB->update_Marker(%marker) || croak( $MDB->error );
    }
  }

  # And the correspondences - both directions
  foreach my $cmarker_id( @correspondences ){
    &prlog( "[INFO][CORR] Creating between IDs $marker_id and $cmarker_id" );
    if( $I ){
      $MDB->set_correspondence
          ( from_marker_id => $cmarker_id,
            to_marker_id   => $marker_id,
            analysis_id    => $corr_analysis_id ) || die( $MDB->error );
    }
  }

  foreach my $key( sort keys %attribs ){
    next if $skip_fields{$key};
    my $val = $attribs{$key};
    my $l = length($val) ;
    $all_attribs{$key} = $l if( $l > $all_attribs{$key} ); #the biggest size for the unused attrib
    print sprintf( "  %-16.16s = %-100.100s, %10d\n", $key, $attribs{$key}, $l ) if $v;
  }
  
  print( "\n" ) if $v;
  

  #warn( "seq length for ", $seq->id, " is ", $seq->length, "\n" );
  #++$c;
  #last if $c >= 3;
}

# All done - update the last_run field of analysis
if( $I and $analysis and ! $gbfile ){
  #warn Dumper( $analysis );
  $analysis->{last_run} = sprintf( "%4.4i-%2.2i-%2.2i", Date::Calc::Today );
  $MDB->update_analysis(%$analysis) || croak( $MDB->error );
  prlog("[INFO][ANALYSIS] ".
        "Updating $analysis->{analysis_name}.last_run to ".
        "$analysis->{last_run}");
}

exit;

#======================================================================
#
sub get_seqio{
  my $analysis = shift;
  my $gb_file  = shift;

  my $count = "???";
  if( $gbfile ){ # Query from local file
    $seqio = Bio::SeqIO->new(-file=>$gbfile, -format=>'genbank');
    print "gb_file = $gb_file\n";
  }
  else { # Direct GenBank query
    my %args = ( -db    => 'nucleotide',
                 -query => $analysis->{description} );
    if( my $mindate = $analysis->{last_run} ){
      $mindate =~ s/-/\//g;
      $args{-mindate} = $mindate;
      $args{-maxdate} = join( "/", Date::Calc::Today() );
    }
    
    my $query = Bio::DB::Query::GenBank->new( %args );
    $count = $query->count;
    my $gb = Bio::DB::GenBank->new();
    $seqio = $gb->get_Stream_by_query($query);
  }
  return( $seqio, $count );
}

#----------------------------------------------------------------------

sub annotation_to_hash{
  my $ac = shift;
  $ac->isa('Bio::AnnotationCollectionI') || 
      die( "Need an AnnotationCollection" );

  my %annots;
  foreach my $key ( $ac->get_all_annotation_keys() ){
    foreach my $annot( $ac->get_Annotations( $key ) ){
      if( $annot->isa('Bio::Annotation::SimpleValue') ){
        $annots{$key} = $annot->value;
      } elsif( $annot->isa('Bio::Annotation::Comment') ){
        $annots{$key} = $annot->text;
      } elsif( $annot->isa('Bio::Annotation::Reference' ) ) {
        $annots{'ref_authors'}  = $annot->authors;
        $annots{'ref_title'}    = $annot->title;
        $annots{'ref_location'} = $annot->location;
        $annots{'ref_pubmed'}   = $annot->pubmed;
        $annots{'ref_medline'}  = $annot->medline;
        if( $annots{'ref_location'} =~ /\((\d{4,4})\)/ ){
          $annots{'ref_year'} = $1;
        }
      } else {
        $annots{$key} = $annot;
      }
    }
  }
  return %annots;
}

#----------------------------------------------------------------------
#
sub seqfeature_to_hash{
  my $sf = shift;
  $sf->isa("Bio::SeqFeatureI") ||
      die( "Need a SeqFeature" );

  my %annots;
  foreach my $tag( $sf->get_all_tags ){
    $annots{$tag} = join( " || ", $sf->get_tag_values( $tag ) );
  }
  return %annots;
}

#----------------------------------------------------------------------
#
# Pre-cache a list of all marker species keyed by gramene_taxonomy_id.
our %all_marker_species;
sub markerdb_species_id{
  unless( %all_marker_species ){
    # Initialise cache
    %all_marker_species = ( map{ $_->{gramene_taxonomy_id} => $_ }
                            @{$MDB->get_all_species} );
  }

  my $bio_species = shift || die( "Need a Bio::Species" );

  my $ncbi_taxid = $bio_species->ncbi_taxid;
  my $term = $ODB->get_term_by_xref('NCBI_taxid',$ncbi_taxid) ||
      die $ODB->error; # TODO - handle this gracefuly!
  my $gramene_tax_id = $term->{term_accession};

  my $species_id = '???';
  if( my $sp = $all_marker_species{$gramene_tax_id} ){
    # Found in cache
    $species_id = $sp->{species_id};
  } 
  else {
    # Create new
    my $species_name = $bio_species->binomial;
    my %sp_data = ( species_id          => $species_id,
                    species             => $species_name,
                    common_name         => $bio_species->common_name,
                    gramene_taxonomy_id => $gramene_tax_id );
    prlog( "[INFO][SPECIES] Creating $sp_data{species} ($gramene_tax_id)" );
    if( $I ){
      $species_id = $MDB->find_or_create_species( $species_name ) ||
          die $MDB->error;
      $sp_data{species_id} = $species_id;
      $MDB->update_species( %sp_data ) ||
          die $MDB->error;
    }
    # Update cache
    $all_marker_species{$gramene_tax_id} = \%sp_data;
  }

  return $species_id;
}

#----------------------------------------------------------------------
#
# Cache a list of all marker germplasms keyed by germplasm name.
our %all_marker_germs;
sub markerdb_germplasm_id{

  unless( %all_marker_germs ){
    # Initialise cache
    foreach( @{$MDB->get_all_germplasm} ){
      my $sid = $_->{species_id};
      my $gid = $_->{germplasm_id};
      my $nm  = $_->{germplasm_name};
      $all_marker_germs{$sid} ||= {};
      $all_marker_germs{$sid}->{$nm} = $gid;
    }
  }

  my $germplasm  = shift || die( "Need a germplasm name" );
  my $truncated = 0;
  if( length( $germplasm ) > 50 ){
    # Truncate name to fit database field
    $germplasm = substr($germplasm,0,47) . "...";
    $truncated ++;
  }
  my $species_id = shift || die( "Need a species_id" );
  my $germplasm_id = '???';
  if( my $id = $all_marker_germs{$species_id}->{$germplasm} ){
    # Found in cache
    $germplasm_id = $id;
  }
  else{
    # Create new
    prlog( "[INFO][GERMPLASM] Creating $germplasm (species_id $species_id)" );
    if( $truncated ){
      prlog( "[WARN][GERMPLASM] Truncated germplasm_name to $germplasm" ); 
    }
    if( $I ){
      $germplasm_id = $MDB->find_or_create_germplasm($germplasm,$species_id) ||
          die( $MDB->error );
    }
    # Update cache
    $all_marker_germs{$species_id}->{$germplasm} = $germplasm_id;
  }
  return $germplasm_id;
}

#----------------------------------------------------------------------
# Compares two species. 
# Assumes that Oryza sativa (japonica) is same as Oryza sativa
sub compare_species{
  my $sp1 = shift || die( "Need a species to compare" );
  my $sp2 = shift || die( "Need a species to compare with" );
  if( $sp1 eq $sp2 ){ return 'EXACT' };
  my( $g1, $s1 ) = split( /\s/, $sp1 );
  my( $g2, $s2 ) = split( /\s/, $sp2 );
  if( $g1 eq $g2 and $s1 eq $s2 ){ return 'NON_EXACT' }
  return undef;
}

#----------------------------------------------------------------------
#
sub log_correspondence_problem{
  my $type = shift || die( "Need a problem type" );
  my $marker1 = shift;
  my $marker2 = shift;

  $type = uc( $type );
  if( $type eq 'SPECIES' ){
    my $t = ( "[WARN][CORR] X-species shared syn: ".
              "Marker %s (%s, %s) vs. ".
              "Marker %s (%s, %s)" );
    prlog( sprintf( $t,
                   $marker1->{marker_name},
                   $marker1->{marker_type},
                   $marker1->{species},
                   $marker2->{marker_name},
                   $marker2->{marker_type},
                   $marker2->{species}, ) );
  } else {
    die( "Unknown correspondence problem, $type" );
  }
}

#----------------------------------------------------------------------
#
sub prlog{
  my $message = shift;
  print LOG $message."\n";
}

#----------------------------------------------------------------------
#
sub print_report{

  # Look for unhandled attributes
  foreach my $attrib( sort keys %all_attribs ){
    prlog("[WARN] Unhandled attrib: $attrib. Length: $all_attribs{$attrib}");
  }

  # Print status info
  my $msg = sprintf
      ( 
        "[INFO] Done. Processed %s record%s of %s with %s error%s.",
        $no_processed, ( $no_processed == 1 ) ? '' : 's',
        $count       , 
        $no_errors   , ( $no_errors    == 1 ) ? '' : 's',
        ); 
  prlog($msg);
  print("$msg\n");

};


#======================================================================


