#!/usr/local/bin/perl

=head1 NAME

load-markers-unigene.pl - load marker data into db direct from GenBank

=head1 SYNOPSIS

  load-markers.pl [options]

Options:

  -h|--help         Show brief help and exit.
  -v|--verbose      Talk about what's happening.
  -c|--config_file  Path to DB config file, def $ENV{GrameneConfPath}.
  -a|--analysis     Name of MarkersDB analysis  to run.
  -n|--no_insert    Do not make changes to the database. Useful for debug.
  --datafile        Path to source data file (if local load).
  --datadir         Path to working dir if downloading files.
  --logdir          Directory to write logs into. Def /tmp
  --logfile         File to write logs into. Def date.pid.analysis.log

=head1 DESCRIPTION

Loads data directly into the markers DB from GenBank UniGene file

Format of config_file;

  <markers_admin>
      db_dsn   dbi:mysql:{dbname}:{myhost}:{myport}
      db_user  marker_rw_user
      db_pass  secret
  </markers_admin>

If it all goes wrong. Here is some SQL that blows away all traces of
an analysis and its markers. Run in order due to key constraints;

  #Clean up correspondences

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.from_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="tigr_gene_index";
 
  delete ac
  from  marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and   m.marker_id=ac.from_marker_id
  and   a.analysis_name="tigr_gene_index";

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.to_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="tigr_gene_index";
 
  delete ac
  from marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and m.marker_id=ac.to_marker_id
  and a.analysis_name="tigr_gene_index";

  # Clean up synonyms
  
  update marker m, analysis a 
  set    m.display_synonym_id=NULL 
  where  m.analysis_id=a.analysis_id 
  and    a.analysis_name="tigr_gene_index";

  delete ms 
  from  marker m, marker_synonym ms, analysis a 
  where m.marker_id=ms.marker_id 
  and   m.analysis_id=a.analysis_id 
  and   a.analysis_name="tigr_gene_index";

  # And the marker

  delete md
  from marker m, analysis a, marker_details_est md
  where m.analysis_id=a.analysis_id
  and   m.marker_id=md.marker_id 
  and   a.analysis_name="tigr_gene_index";

  delete m 
  from  marker m, analysis a 
  where m.analysis_id=a.analysis_id 
  and   a.analysis_name="tigr_gene_index";

  # Reset the analysis to re-run

  update analysis 
  set last_run=NULL 
  where analysis_name="tigr_gene_index";


=head1 SEE ALSO

Gramene::Marker::DB, Text::RecordParser.

=head1 AUTHOR

Will Spooner E<lt>whs@ebi.ac.ukE<gt>.

=cut

use strict;
use Getopt::Long;
use Pod::Usage;
use Data::Dumper qw(Dumper);
use Date::Calc;
use Net::FTP;
use IO::Scalar;
use IO::Zlib qw(:gzip_external 1);
use File::Copy;

# The following paths to the gramene API may need to be changed!
# TODO: Use FindBin for relative paths instead!
#use lib qw(/usr/local/bioperl);
#use lib qw(/usr/local/gramene);

use Bio::ClusterIO;
use Bio::SeqIO;

use lib "/home/weix/gramene/lib/perl";
use Gramene::Marker::DB;
use Gramene::Ontology::OntologyDB;


#---
# Handler blocks
our ( $PROCESSED_COUNT, $COUNT, $ERROR_COUNT ) = ( 0, '???', 0 );
our ( %all_attribs );
END{ &print_report; exit(0) }
$SIG{'INT'} = sub { exit(0) };

#---
# Default network/filesystem locations
our $FTP_HOST_UNIGENE = 'ftp.ncbi.nih.gov';
our $FTP_DIR_UNIGENE  = '/repository/UniGene';
our $FTP_HOST_TIGR_GI = 'ftp.tigr.org';
our $FTP_DIR_TIGR_GI  = '/pub/data/tgi';
our $DATADIR_UNIGENE  = '/usr/local/data/cluster/unigene';
our $DATADIR_TIGR_GI  = '/usr/local/data/cluster/tgi';

#---
# Pre-processing
our( $I, $V, $ANALYSIS, $datadir, @datafiles );
our( $MDB, $ODB );

our ($year,$month,$day) = Date::Calc::Today;
print "date= $year,$month,$day\n";

SETUP:{

  #---
  # Get options
  my ( $help, $conffile, $analysis,
       $no_insert, $logfile, $logdir );
  GetOptions(
             'help'           => \$help,
             'config_file:s'  => \$conffile,
             'analysis:s'     => \$analysis,
             'datafile:s'     => \@datafiles,
             'datadir:s'      => \$datadir,
             'verbose'        => \$V,
             'no_insert'      => \$no_insert,
             'logdir:s'       => \$logdir,
             'logfile:s'      => \$logfile,
             );
  pod2usage(-verbose => 2) if $help;
  
  #---
  # Set defaults
  $logdir  ||= $ENV{PWD};

  #----
  # Validate params
  if( defined $conffile ){ $ENV{GrameneConfPath} = $conffile }
  #$no_insert ++ if @datafiles; # Use datafiles for debug only
  $I = $no_insert ? 0 : 1;
  unless( $I ){ warn( "[INFO] Evaluation run - no db changes will be made\n")}
  $analysis || ( warn( "\n[*DIE] Need a -analysis, e.g. ncbi_unigene" ) &&
                 pod2usage );

  #---
  # Validate files/paths
  foreach my $file( $ENV{GrameneConfPath}, @datafiles ){

    print "$file\n" if $V;
    unless( $file ){
      warn( "\n[*DIE] Unable to find config file\n\n" );
      pod2usage;
    } unless( -e $file ){
      warn( "\n[*DIE] File $file does not exist\n\n" );
      pod2usage;
    } unless( -r $file ){
      warn( "\n[*DIE] Cannot read $file\n\n" );
      pod2usage;
    } unless( -f $file ){
      warn( "\n[*DIE] File $file is not plain-text\n\n" );
      pod2usage;
    } unless( -s $file ){
      warn( "\n[*DIE] File $file is empty\n\n" );
      pod2usage;
    }
  }

  #----
  # Log stderr to logfile. 


  unless( $logfile ){
    my( $file ) = ( $0 =~ m/([^\/]+)$/ );
    $file =~ s/\.\w+$//;
    
    my $date = sprintf('%4.4i%2.2i%2.2i',$year,$month,$day);
    $logfile = join( ".", $date, $$, $file,'log' );
  }
  if( uc($logfile) eq 'STDERR' or uc($logfile) eq 'STDOUT' ){
    # Log to STDOUT
    open( LOG, ">&$logfile" ) or die( $! );
  } else {
    # Log to file
    open( LOG, ">$logdir/$logfile" ) or die( $! );
    open( STDERR, ">&LOG" ) or die( $! );
  }

  #-----
  # Connect to DBs
  $ENV{GATEWAY_INTERFACE} ++; # tell DBD::mysql to auto-reconnect
  if( $I ){ # use read-write DB user
    $MDB = Gramene::Marker::DB->new( admin=>1 );
  } else  { # use read-only DB user
    $MDB = Gramene::Marker::DB->new;
  }
  $MDB || die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";
  $ODB = Gramene::Ontology::OntologyDB->new ||
      die "\n[*DIE] " . Gramene::Ontology::OntologyDB->error . "\n\n";

  #---
  # Get primary analysis from DB
  $ANALYSIS = $MDB->view_analysis(analysis_name=>$analysis) ||
  die "\n[*DIE] " . $MDB->error . "\n\n";

}

#----
# Get analysis data from DB
#my $corr_analysis = $MDB->view_analysis(analysis_name=>'EST_CLUSTER') ||
#    die "\n[*DIE] " . $MDB->error . "\n\n";
#my $corr_analysis_id = $corr_analysis->{analysis_id};


#---
# Create list of all marker types
my %MARKER_TYPES;
foreach my $type( @{$MDB->get_all_marker_types} ){
  $MARKER_TYPES{lc($type->{marker_type})} = $type->{marker_type_id};
}

#---
# Create a list of all analyses
my %ANALYSES;
foreach my $analysis( @{$MDB->get_all_analyses} ){
  $ANALYSES{lc($analysis->{analysis_name})} = $analysis->{analysis_id};
}

#---
# Marker type attributes to keep/skip
# TODO: make generic
#my $table = 'marker_details_est_cluster';

my ($table, @fields) = $MDB->marker_type_to_table_name("EST Cluster");
#my %detail_fields = ( map{$_=>1}  
                      #@{Gramene::Marker::DB::DETAILS_FIELD_NAMES->{$table}} );
my %detail_fields = ( map{$_=>1}  @fields);
my %skip_fields; 

#---
# Get a list of hashrefs representing input streams. Implementation
# will vary depending on the analysis.
my @streams = &get_input_streams( $ANALYSIS, @datafiles );


#---
# Loop for each est_cluster file, and each cluster
my $batch_size = 1000; # For progress output
STREAM: foreach my $stream( @streams ){
  prlog("[INFO] Processing stream $stream->{description}");
  print("...processing stream $stream->{description}...\n");
  my $STREAM_COUNT = 0;

  ENTRY: while( my $entry = &get_next_entry( $ANALYSIS, $stream ) ){

      print "get entry\n" if $V;    
    MARKER: foreach my $marker( &get_markers( $ANALYSIS, $stream, $entry ) ){

      #---
      # Status info
      my $batch = $PROCESSED_COUNT/$batch_size;
      if( $batch - int($batch) eq 0 ){
        print( "...processed $STREAM_COUNT in stream ".
               "($PROCESSED_COUNT tot)...\n" );
      }
      $PROCESSED_COUNT ++;
      $STREAM_COUNT    ++;

      print "get MARKER $marker->{marker_name}, " if $V; 
      #---
      # How do we handle this marker?
      # Option 1 - new marker, insert in DB
      # Option 2 - updated marker, update DB
      # Option 3 - old/duplicate marker, ignore
      # Extra 1  - create correspondences based on shared synonyms
      #            from markers of same species but different type.
      # Extra 2  - Warn if shared synonym with different species.
      my @synonyms = ( $marker->{marker_name}, @{$marker->{synonyms}||[]} );
      my @cmarker_ids;
      eval{
	@cmarker_ids = $MDB->marker_synonym_search( synonyms => [@synonyms] );
      };

      if($@){
	print "[*ERROR] $@";
      }
	
      my $duplicate = 0;
      my @correspondences;
      foreach my $cmarker_id( @cmarker_ids ){
	print "cmarker_ids = $cmarker_id\n" if $V;
        my $cmarker = $MDB->view_marker( marker_id => $cmarker_id );

        # Test whether markers from same species
        if( &compare_species( $marker->{species}, $cmarker->{species} )){

          # Test whether markers from same type
          if( uc($marker->{marker_type}) eq uc($cmarker->{marker_type}) ){

            # Test whether markers from same date
            my( $m_date, $cm_date );
            ( $m_date  = $marker->{details}->{date_updated}  ) =~ s/-//g;
            ( $cm_date = $cmarker->{details}->{date_updated} ) =~ s/-//g;
            if( $m_date <= $cm_date ){
              # No change from marker already in DB!              
              &prlog( "[INFO][MARKER] already up-to-date in DB: ".
                      "$marker->{marker_name} ".
                      "($marker->{marker_type}, $marker->{species})" );
              $duplicate ++;
              next MARKER; # Just skip the rest of the processing for now...
            } else {
              # Updated marker
              # TODO: think about handling curated data!
              $marker->{marker_id} = $cmarker->{marker_id};
              # Combine synonyms into unique list
              my %syns = ( map{$_=>1}
                           ( @{$marker->{synonyms}},
                             map{$_->{marker_name}} @{$cmarker->{synonyms}} ));
              $marker->{synonyms} = [keys %syns];
            }
          } else {
            # Marker has shared synonym to marker of same species
            push( @correspondences, 
                  { from_marker_id => $cmarker_id,
                    analysis_id    => $ANALYSES{shared_synonym} } );
          }
        } else {
          # Marker has shared synonym to marker of different species
          &log_correspondence_problem( 'species', $marker, $cmarker );
        }
      }

      #---
      # Get analysis-specific correspondences
      push( @correspondences, 
            &get_correspondences( $ANALYSIS, $stream, $entry, $marker ) );

     
      #---
      # Add/update the marker to the DB
      $marker->{analysis_id} ||= $ANALYSIS->{analysis_id}; # Set if missing
      my $marker_id = $marker->{marker_id};
      unless( $marker_id ){
        # Insert marker into DB
        &prlog( "[INFO][MARKER] Creating $marker->{marker_name} ".
                "($marker->{marker_type}, $marker->{species})" );
        if( $I ){
          $marker->{marker_id} = $MDB->create_marker(%$marker) ||
              die( $MDB->error );
          
        }
      } else {
        # Update marker
        &prlog( "[INFO][MARKER] Updating $marker->{marker_name} ".
                "($marker->{marker_type}, $marker->{species})" );    
        if( $I ){
          $MDB->update_marker(%$marker) || die( $MDB->error );
        }
      }
      
       print "Correspondences\n" if $V;
      # And the correspondences
      foreach my $corr( @correspondences ){
        $corr->{from_marker_id} ||= $marker->{marker_id};
        $corr->{to_marker_id}   ||= $marker->{marker_id};
        &prlog("[INFO][CORR] Creating between IDs ".
               "$corr->{from_marker_id} and $corr->{to_marker_id}");
        if( $I ){
          $MDB->set_correspondence( %$corr ) || die( $MDB->error );
        }
      }

       print "after Correspondences\n" if $V;
      # Add the detail attributes to the list for validation
      my %details = %{$marker->{details}};
      foreach my $key( sort keys %details ){
        next if( $skip_fields{$key} or $detail_fields{$key});
        my $l = length($details{$key});
        $all_attribs{$key} = $l if( $l > $all_attribs{$key} ); 
        print sprintf( "%-16.16s = %-100.100s\n", $key, $details{$key} ) if $V;
      }
    } # MARKER
  } # ENTRY
} # STREAM

# All done - update the last_run field of analysis
$ANALYSIS->{last_run} = sprintf( "%4.4i-%2.2i-%2.2i", Date::Calc::Today );
if( $I ){
  $MDB->update_analysis(%$ANALYSIS) || die( $MDB->error );
}
prlog("[INFO][ANALYSIS] ".
      "Updating $ANALYSIS->{analysis_name}.last_run to ".
      "$ANALYSIS->{last_run}");


exit;



#======================================================================
# Returns a list of input stream hashrefs.
# The exact representation is analysis-specific, 
# but common keys are respected. TODO: document common keys!
sub get_input_streams{
  my $analysis = shift;
  my @datafiles = @_;

  unless( $analysis and ref($analysis) eq 'HASH' ){ 
    die( "Need an analysis hashref" );
  }

  if( $analysis->{type} eq 'entrez' ){
    return &get_entrez_input_streams( $analysis, @datafiles );
  } 
  elsif( $analysis->{type} eq 'unigene' ){
    return &get_unigene_input_streams( $analysis, @datafiles );
  } 
  elsif( $analysis->{type} eq 'tigr_gene_index' ){
    print "analysis = tigr_gene_index\n" if $V;
    return &get_tgi_input_streams( $analysis, @datafiles );
  }
  elsif( $analysis->{type} eq 'plantgdb_tug' ){
    return &get_tug_input_streams( $analysis, @datafiles );
  }
  else{ 
    prlog("[*DIE] Problem with analysis $analysis->{analysis_name}: ".
          "Cannot create stream for analysis.type $analysis->{type} " );
    exit;
  }
}

#----------------------------------------------------------------------
# This is the auto-update code that sync's gramene's unigene files
# with the latest from NCBI.
#
# Gramene's UniGene files are kept in the datadir 
# (/usr/local/data/cluster/unigene by default).
# These have filenames that match YYYYMMDD.species.unigene
# Files may be gzipped.E.g. 20050722.Zea_mays.unigene.gz
# The file's timestamp should correspond with the "ESTs are from dbEST 
# through DD Mmm YYYY" line in the <species>.info (e.g. Os.info) file
# available from the UniGene FTP site.
#
# This code compares the timestamp in indicated by the analysis
# with that in the UniGene <species>.info file. If the latter is greater
# then the 'newer' file will be downloaded into the datadir and named
# appropriately. 
#
# A list of 'stream' hashrefs corresponding to updated files will be returned
#
# Any unigene set for which the species matches one in Gramene MarkersDB 
# will be considered for processing, except those explicitly excluded.
#
sub get_unigene_input_streams{
  my $analysis = shift;
  my @datafiles = @_;
  my @streams;

  $datadir ||= $DATADIR_UNIGENE;

  my %ignore_species = ('Arabidopsis_thaliana'=>1); # Don't process these!

  #---
  # If datafiles, then use these for local load
  if( scalar( @datafiles ) ){
    foreach my $file( @datafiles ){
      my $fh;
      if( $file =~ /gz$/ ){ # Gzipped file
        $fh = IO::File->new();
        tie *$fh, 'IO::Zlib', $file, "rb";
      } else { # Plain text file
        my $fh = IO::File->new("< $file");
      }
      my $clustio = Bio::ClusterIO->new( -format=>'unigene', -fh=>$fh );
      push( @streams,
            { ClusterIO   => $clustio,
              datafile    => $file, 
              description => "NCBI UniGene data from $file" } );
      prlog("[INFO] Created stream for UniGene data from $file");
    }
  }

  #---
  # No datafiles, FTP the datafiles from GenBank
  else{
    # Initialise the FTP connection
    my $ftp = Net::FTP->new($FTP_HOST_UNIGENE) ||
        die( "[*DIE] Cannot connect to $FTP_HOST_UNIGENE: $@" );
    $ftp->login("anonymous",'-anonymous@') ||
        die( "[*DIE] Cannot login ", $ftp->message );
    $ftp->cwd($FTP_DIR_UNIGENE) ||
        die( "[*DIE] Cannot change working directory ", $ftp->message );

    # Loop through each UniGene species
  SP: foreach my $sp( $ftp->ls ){
    next if $ignore_species{$sp};
    my $sp_str = $sp;
    $sp_str =~ s/_/ /g;
   
    # If UniGene species matches MarkerDB, then consider processing
    if( my $gsp = $MDB->view_species( species=>$sp_str ) ){
     
      # Parse the date stamp and version from the <species>.info file
      my $ugene_date = 0;
      my( $info_file) = $ftp->ls("$sp/*.info");
      my( $prefix   ) = $info_file =~ m/(\w+).info/;
      my $info_data;
      my $sh = IO::Scalar->new(\$info_data);
      $ftp->ascii; # Need to play nice with Scalar::IO
      $ftp->get( $info_file, $sh );

      # Get UniGene version
      my $ugene_version;
      if( $info_data =~ m/(UniGene Build \#\d+)/ ){
        $ugene_version = $1;
      }

      # Get UniGene release date
      my( $ugene_date, $file_date );
      if( $info_data =~ m/dbEST through (\d+) (\w+) (\d+)/ ){
        my ( $d,$m,$y ) = ($1,$2,$3);
        $m = Date::Calc::Decode_Month($m);
        $ugene_date = sprintf( "%4.4d-%2.2d-%2.2d", $y,$m,$d );
        $file_date  = sprintf( "%4.4d%2.2d%2.2d", $y,$m,$d );
      } else {
        # Date not found
        &prlog("[WARN] UniGene date for $sp not found") && next(SP);
      }  
      
      # Download file if unigene date > analysis.last_run
      if( $ugene_date > ($analysis->{last_run} || 0) ){
        # FTP the latest unigene file
        my $grm_file = "$datadir/$ugene_date.$sp.unigene.gz";
        prlog( "[INFO] FTPing $sp $ugene_version $ugene_date to $grm_file" );
        $ftp->binary;
        $ftp->get( "$sp/$prefix.data.gz", $grm_file );
        
        # Create a stream on latest file
        my $fh = IO::File->new();
        tie *$fh, 'IO::Zlib', $grm_file, "rb";
        my $clustio = Bio::ClusterIO->new( -format=>'unigene', -fh=>$fh );
        push( @streams,
              { ClusterIO    => $clustio,
                datafile     => $grm_file, 
                version      => $ugene_version,
                date_updated => $ugene_date,
                species      => $sp,
                prefix       => $prefix,
                marker_type  => "est_cluster",
                description  => ( "UniGene data for ".
                                  "$sp $ugene_version $ugene_date" ),    
              } );
        prlog("[INFO] Created stream for UniGene data from $grm_file");

      }
    }
  }
  }

  return @streams;
}

#----------------------------------------------------------------------
# This is the auto-update code that sync's gramene's tgi files              
# with the latest from TIGR
#                            
# Gramene's tgi files are kept in the datadir 
# (/usr/local/data/cluster/tgi by default).
# These have filenames that match YYYYMMDD.species.tgi and 
# YYYYMMDD.species.fasta
# Files may be zipped.E.g. 20050722.Zea_mays.tigr.zip
#
# This code compares the timestamp in indicated by the analysis
# with that of the tgi files. If the latter is greater
# then the 'newer' file will be downloaded into the datadir and named
# appropriately. 
#
# A list of 'stream' hashrefs corresponding to updated files will be returned
#
# Any tgi set for which the species matches one in Gramene MarkersDB 
# will be considered for processing, except those explicitly excluded.
#
sub get_tgi_input_streams{
  my $analysis = shift;
  my @datafiles = @_;
  my @streams;

  $datadir ||= $DATADIR_TIGR_GI;

  my %ignore_species = ('Arabidopsis_thaliana'=>1); # Don't process these!      
  #---
  # If datafiles, then use these for local load
  if( scalar( @datafiles ) ){
    #TODO: Implement!
    
    my %files_for_streams;
    my ($latest_date, $sp, $latest_version, $prefix, $sp_str) ;
    foreach my $file( @datafiles ){ 
      print "In get_tgi_input_streams, process $file\n";
      my $filetype = 'fasta';
      if( $file =~ m=(\d{4}) (\d{2}) (\d{2}) # year month day
                     \. ([A-Z]+)             # prefix ZMGI
                     \. r(\d+)               # release
                     \. (\w+)                  # specise
                     \. tgi
                     \. (\w+)\z =xmsi ){ 
	#20051012.ZMGI.r16.Zea_mays.tgi.fasta
	
	$latest_date = "$1-$2-$3";
	$prefix      = $4;
	$latest_version = "$prefix release $5";
	$sp          = $6;
	$filetype    = lc($7);

	print "Matched $latest_date,\n" if $V;	
      }else{
	print "[*ERROR] Bad TIGR file name format, $file, filename should look like 20051012.ZMGI.r16.Zea_mays.tgi.fasta\n";
	next;
      }
				    
      $sp_str = $sp;
      $sp_str =~ s/_/ /;
      $files_for_streams{$filetype} = $file;
      
      print "$filetype => $files_for_streams{$filetype}\n" if $V;
				    
    }

    if(!$files_for_streams{tcs} || !$files_for_streams{fasta} || !$files_for_streams{tc_est}){
      die "[*DIE] Missing files, must have all there types tcs, fasta, tc_est\n";
    }
    # Create streams on the tcs, fasta and tc_est files.
    # There are three streams for each TGI set;
    #   a list of all TCs and their synonyms,
    #   a list of all TCs and their members,
    #   a fasta file of TC concensus sequences.
    # Set date_updated for the synonym file (first to be processed)
    # to two days before the real release, and the est file to one
    # day before. That way the marker gets updated on each pass of the 
    # main subroutine.
    my( $y, $m, $d ) = split( "-", $latest_date );
    my $tcs_date   = sprintf( "%4.4d-%2.2d-%2.2d",
			      Date::Calc::Add_Delta_Days($y,$m,$d,-2) );
    my $fasta_date = sprintf( "%4.4d-%2.2d-%2.2d",
			      Date::Calc::Add_Delta_Days($y,$m,$d,-1) );
    my $est_date   = $latest_date;       

print "$tcs_date, $fasta_date, $est_date\n" if $V;    
    push( @streams,
	  { FileHandle   => IO::File->new($files_for_streams{tcs}),
	    datatype     => 'tcs',
	    datafile     => $files_for_streams{tcs},
	    version      => $latest_version,
	    date_updated => $tcs_date,
	    species      => $sp_str,
	    prefix       => $prefix,
	    marker_type  => "est_cluster",
	    description  => ( "TGI tcs data for".
			      "$sp $latest_version, $latest_date" ) } );
    push( @streams,
	  { SeqIO        => Bio::SeqIO->new
	    (-format => 'fasta',
	     -file   => $files_for_streams{fasta}),
	    datatype     => 'fasta',
	    datafile     => $files_for_streams{fasta},
	    version      => $latest_version,
	    date_updated => $fasta_date,
	    species      => $sp_str,
	    prefix       => $prefix,
	    marker_type  => "est_cluster",
	    description  => ( "TGI fasta data for".
			      "$sp $latest_version, $latest_date" ) } );
    
    push( @streams,
	  { FileHandle   => IO::File->new($files_for_streams{tc_est}),
	    datatype     => 'tc_est',
	    datafile     => $files_for_streams{tc_est},
	    version      => $latest_version,
	    date_updated => $est_date,
	    species      => $sp_str,
	    prefix       => $prefix,
	    marker_type  => "est_cluster",
	    description  => ( "TGI tc_est data for".
			      "$sp $latest_version, $latest_date" ) } );
    
    #print "Stream content sample=\n" ;

    #for my $s(@streams){
     # for my $k (keys %{$s}){
	#print "$k\n";
      #}
    #}
    return @streams;
  }
  
  #---
  # No datafiles, FTP the datafiles from source
  else{ 
    # Initialise the FTP connection
    my $ftp = Net::FTP->new($FTP_HOST_TIGR_GI) ||
      die( "[*DIE] Cannot connect to $FTP_HOST_TIGR_GI: $@" );
    $ftp->login("anonymous",'-anonymous@') ||
      die( "[*DIE] Cannot login ", $ftp->message );
    $ftp->cwd($FTP_DIR_TIGR_GI) ||
      die( "[*DIE] Cannot change working directory ", $ftp->message );
    
    # Loop through each TGI species
  SP: foreach my $sp( $ftp->ls ){  
      next if $ignore_species{$sp};
      my $sp_str = $sp;
      $sp_str =~ s/_/ /g;
      print "TIGR ftp site : $sp_str\n" if $V;
      # If UniGene species matches MarkerDB, then consider processing
      
      my $gsp;
      eval{
	$gsp = $MDB->view_species( species=>$sp_str );
      };
      
      if( $@ ){
	print "$sp_str not a gramene species, $@\n";
	next;
      }
      
      #    if( my $gsp = $MDB->view_species( species=>$sp_str ) ){ 
      
      
      
      # This directory contains all tgi releases for $sp. FInd most recent
      #
      
      my $latest_version = 0;
      my $latest_date = 0;
      my $latest_cmp_date = 0;
      my $prefix;
      my $latest_file;
      my $release = 0;
      foreach my $dir( $ftp->dir("$sp/*.release_*.zip" )){
	print "Process $dir\n" if $V;
	my @bits = split( /\s+/, $dir);
	my $file = $bits[8];
	my( $m,$d,$y ) = @bits[5..7]; #Oct 12 20:45
	$m = Date::Calc::Decode_Month($m); #
	
	# in case of Oct 12 20:45
	if($y =~ /\d+:\d+/ ){
	  $y =  $m > $month ? $year - 1 : $year;
	}
	
	my $date      = sprintf( "%4.4d-%2.2d-%2.2d", $y,$m,$d );
	my $cmp_date  = sprintf( "%4.4d%2.2d%2.2d", $y,$m,$d );
	
	if( $file =~ m/(\w+)\.release_(.+)\.zip/ ){
	  if( $cmp_date > $latest_cmp_date ){
	    $prefix = $1;
	    $release = $2;
	    $latest_version = "$1 release $2";
	    $latest_date = $date;
	    $latest_cmp_date = $cmp_date;
	    $latest_file = $file;
	  }
	}
      }
      
      # Download file if latest tgi date > analysis.last_run
      my $last_run = $analysis->{last_run} || 0;
      $last_run =~ s/-//g; print "$latest_cmp_date > $last_run ??\n" if $V;
      if( $latest_cmp_date > $last_run ){
	my $grm_file = "$datadir/$latest_cmp_date.$prefix.r$release.$sp.tgi.zip";
	
	prlog( "[INFO] FTPing $sp $latest_version $latest_date to $grm_file" );
	$ftp->binary;
	$ftp->get( $latest_file, $grm_file );
	
	# Have to use unzip to unpack the file
	my @args = ("unzip","-u","-q","-d",$datadir,$grm_file);
	system( @args ) == 0 or die( "[*DIE] system @args failed: $?" );
	
	# Rename unpacked files
	unlink( "$datadir/README" );
	my %files_for_streams;
	foreach my $file( glob("$datadir/$prefix*") ){
	  my $filetype = 'fasta';
	  if( $file =~ m/$datadir\/$prefix\.(\w+)\./ ){
	    $filetype = lc($1);
	  }
	  my $grm_name = "$datadir/$latest_cmp_date.$prefix.r$release.$sp.tgi.$filetype"; #20051012.ZMGI.r16.Zea_mays.tgi.fasta
	File::Copy::move($file,$grm_name);
	$files_for_streams{$filetype} = $grm_name;
      }
      
      # Create streams on the tcs, fasta and tc_est files.
      # There are three streams for each TGI set;
      #   a list of all TCs and their synonyms,
      #   a list of all TCs and their members,
      #   a fasta file of TC concensus sequences.
      # Set date_updated for the synonym file (first to be processed)
      # to two days before the real release, and the est file to one
      # day before. That way the marker gets updated on each pass of the 
      # main subroutine.
      my( $y, $m, $d ) = split( "-", $latest_date );
      my $tcs_date   = sprintf( "%4.4d-%2.2d-%2.2d",
				Date::Calc::Add_Delta_Days($y,$m,$d,-2) );
      my $fasta_date = sprintf( "%4.4d-%2.2d-%2.2d",
				Date::Calc::Add_Delta_Days($y,$m,$d,-1) );
      my $est_date   = $latest_date;       
      
      push( @streams,
	    { FileHandle   => IO::File->new($files_for_streams{tcs}),
	      datatype     => 'tcs',
	      datafile     => $files_for_streams{tcs},
	      version      => $latest_version,
	      date_updated => $tcs_date,
	      species      => $sp_str,
	      prefix       => $prefix,
	      marker_type  => "est_cluster",
	      description  => ( "TGI tcs data for".
				"$sp $latest_version, $latest_date" ) } );
      push( @streams,
	    { SeqIO        => Bio::SeqIO->new
	      (-format => 'fasta',
	       -file   => $files_for_streams{fasta}),
	      datatype     => 'fasta',
	      datafile     => $files_for_streams{fasta},
	      version      => $latest_version,
	      date_updated => $fasta_date,
	      species      => $sp_str,
	      prefix       => $prefix,
	      marker_type  => "est_cluster",
	      description  => ( "TGI fasta data for".
				"$sp $latest_version, $latest_date" ) } );
      
      push( @streams,
	    { FileHandle   => IO::File->new($files_for_streams{tc_est}),
	      datatype     => 'tc_est',
	      datafile     => $files_for_streams{tc_est},
	      version      => $latest_version,
	      date_updated => $est_date,
	      species      => $sp_str,
	      prefix       => $prefix,
	      marker_type  => "est_cluster",
	      description  => ( "TGI tc_est data for".
				"$sp $latest_version, $latest_date" ) } );
      
    }
    
    print "TIGR ftp site end: \n" if $V;
  } 
  
}
print "there are " . scalar @streams . " files\n";
return @streams;
}

#----------------------------------------------------------------------
# For a given analysis and stream hashref, return the next entry in 
# stream.
# The exact representation of the entry will depend on the analysis.
sub get_next_entry{
  my $analysis = shift;
  my $stream = shift;

  if( $stream->{ClusterIO} ){ # Bio::ClusterIO stream
    return $stream->{ClusterIO}->next_cluster;
  }
  if( $stream->{SeqIO} ){ # Bio::SeqIO stream
    return $stream->{SeqIO}->next_seq;
  }
  if( my $fh = $stream->{FileHandle} ){ # Simple file handle stream
    return <$fh>;
  }
  die( "Cannot get next entry from $stream->{description}" );
}

#----------------------------------------------------------------------
# This method turns the entry from the stream into a list of markers.
# Analysis-specific 
sub get_markers{
  my $analysis = shift;
  my $stream   = shift;
  my $entry    = shift;

    unless( $analysis and ref($analysis) eq 'HASH' ){ 
    die( "Need an analysis hashref" );
  }

  if( $analysis->{type} eq 'entrez' ){
    return &get_entrez_markers( $analysis, $stream, $entry );
  } 
  elsif( $analysis->{type} eq 'unigene' ){
    return &get_unigene_markers( $analysis, $stream, $entry );
  } 
  elsif( $analysis->{type} eq 'tigr_gene_index' ){
    return &get_tgi_markers( $analysis, $stream, $entry );
  }
  elsif( $analysis->{type} eq 'plantgdb_tug' ){
    return &get_tug_markers( $analysis, $stream, $entry );
  }
  else{ 
    prlog("[*DIE] Problem with analysis $analysis->{analysis_name}: ".
          "Cannot get markers for analysis.type $analysis->{type} " );
    exit;
  }
}

#----------------------------------------------------------------------
#
sub get_unigene_markers{
  my $analysis = shift;
  my $stream   = shift;
  my $clust    = shift;

  my $class = "Bio::Cluster::UniGeneI";
  $clust->isa($class) || die( "Entry must be a $class, not $clust" );

  # Initialise the marker annotation variables, and populate the easy ones
  my $marker = {
    marker_name     => $clust->unigene_id,
    marker_type     => $stream->{marker_type},
    description     => $clust->description,
    analysis_id     => $analysis->{analysis_id},
    synonyms        => [],
    details         => {},
  };
    
  # Process species
  my $species_obj = $clust->species;
  $marker->{species_id} = &markerdb_species_id($species_obj);
  $marker->{species}    = $species_obj->binomial;
  
  # Process synonyms
  if( my $syn = $clust->display_name ){
    if( $syn ne $marker->{marker_name} ){
      # Not sure if this is ever the case
      push( @{$marker->{synonyms}}, $syn );
    }
  }

  # Process other annotation fields
  my %details = ( version      => $stream->{version},
                  date_updated => $stream->{date_updated} );

  # others (single val)
  $details{authority} = $clust->authority if $clust->authority; # NCBI
  $details{namespace} = $clust->namespace if $clust->namespace; # UniGene

  # Sequence counts
  $details{seqence_count} = $clust->scount; # total
  foreach my $seq( @{$clust->sequences} ){
    my $stype = lc( $seq->{seqtype} );
    $details{$stype."_count"} ++;
  }

  #---
  # Other possible annotation that we are not using for now;
  #if( $clust->title ne $marker->{description} and $clust->title ){
  #  # Not sure if this is ever the case
  #  $details{title} = $clust->title;
  #}
  #$details{version}       = $clust->version       if $clust->version;
  #$details{gene}          = $clust->gene          if $clust->gene;
  #$details{cytoband}      = $clust->cytoband      if $clust->cytoband;
  #$details{mgi}           = $clust->mgi           if $clust->mgi;
  #$details{restr_expr}    = $clust->restr_expr    if $clust->restr_expr;
  #$details{gnm_terminus}  = $clust->gnm_terminus  if $clust->gnm_terminus;
  #$details{cluster_score} = $clust->cluster_score if $clust->cluster_score;
  # others (multi val)
  #if( my $mval = join( ", ", @{$clust->locuslink}) ){
  #  $details{locuslink} = $mval;
  #}
  #if( my $mval = join( ", ", @{$clust->express} ) ){
  #  $details{express} = $mval;
  #}
  #if( my $mval = join( ", ", @{$clust->chromosome} ) ){
  #  $details{chromosome} = $mval;
  #}
  #if( my $mval = join( ", ", @{$clust->sts} ) ){
  #  $details{sts} = $mval;
  #}
  #if( my $mval = join( ", ", @{$clust->txmap} ) ){
  #  $details{txmap} = $mval;
  #}
  # homology/protein-similarity. Skip for now
  #prlog( "==> homol:        ". $clust->homol );
  #prlog( "==> protsim:      ". join(", ", @{$clust->protsim}) );
  # Stuff duplicated by next_seq
  #prlog( "==> sequences:    ". join(", ", @{$clust->sequences}) );
  #prlog( "==> get_members:  ". join(", ", $clust->get_members) );
  
  #---
  # loop through each sequence in order to;
  #   count coresponding sequences of each type
  #   collect correspondence info
#  my @correspondences;
#  while( my $seq = $clust->next_seq ){
#    my $accession = $seq->accession;
#    
#    # Determine type of sequence, and update counter
#    my $seqtype;
#    my $marker_type_id;
#    if( $seq->annotation ){
#      ($seqtype) = $seq->annotation->get_Annotations('seqtype');
#      $details{lc($seqtype)."_count"} ++ if $seqtype;
#      $marker_type_id = $MARKER_TYPES{lc($seqtype)};  
#    }
#    
#    # Find IDs of correspondence markers
#    unless( $marker_type_id ){
#      prlog( "[WARN][CORR] Marker type $seqtype not found" );
#      next;
#    }
#    my @corr_markers = $MDB->marker_synonym_search
#        ( 
#          synonyms => [$accession],
#          marker_type_id => $marker_type_id );
#    if( ! @corr_markers ){
#      prlog( "[WARN][CORR] Marker $accession ($seqtype) not found" )
#        } elsif ( @corr_markers > 1 ){
#          prlog( "[WARN][CORR] Marker $accession ($seqtype) maps to ".
#                 scalar @corr_markers. " entries" );
#        }
#    push(@correspondences, @corr_markers);
#  } # end next_seq
  
  $marker->{details} = {%details};
  return( $marker );
}

#----------------------------------------------------------------------
#
sub get_tgi_markers{
  my $analysis = shift;
  my $stream   = shift;
  my $entry    = shift;

  if( $stream->{datatype} eq 'tcs' ){
    # The tcs datatype contains the list of synonyms
    my @synonyms = split( /\s+/, $entry );
    my $marker_name = shift @synonyms;
    $marker_name =~ s/^>//;
    return({
      marker_name     => $marker_name,
      marker_type     => $stream->{marker_type},
      species         => $stream->{species},
      synonyms        => [@synonyms],
      details         => { date_updated => $stream->{date_updated},
                           authority    => 'TIGR',
                           namespace    => 'Gene_Index',
                           version      => $stream->{version} }
    });
  }
  
  if( $stream->{datatype} eq 'fasta' ){
    # The fasta datatype contains the concensus sequence
    my $class = "Bio::SeqI";
    $entry->isa($class) || die( "Entry must be a $class, not $entry" );
    return({
      marker_name     => $entry->display_id,
      description     => $entry->description,
      marker_type     => $stream->{marker_type},
      species         => $stream->{species},
      details         => { date_updated => $stream->{date_updated},
                           sequence     => $entry->seq, }
    });
  }

  if( $stream->{datatype} eq 'tc_est' ){
    # The tc_est datatype contains the list of mappings to ESTs
    my @ests = split( /\s+/, $entry );
    return({
      marker_name     => shift @ests,
      marker_type     => $stream->{marker_type},
      species         => $stream->{species},
      details         => { date_updated => $stream->{date_updated},
                           seqence_count => scalar(@ests) }
    });
  }

  die("[*DIE] datatype must be tcs, tc_est or fasta, not $stream->{datatype}");

}

#----------------------------------------------------------------------
# This method turns the entry from the stream into a list of markers.
# Analysis-specific 
sub get_correspondences{
  my( $analysis, $stream, $entry, $marker ) = @_;

  unless( $analysis and ref($analysis) eq 'HASH' ){ 
    die( "Need an analysis hashref" );
  }

  if( $analysis->{type} eq 'entrez' ){
    return ();
    #return &get_entrez_correspondences( @_ );
  } 
  elsif( $analysis->{type} eq 'unigene' ){
    return &get_unigene_correspondences( @_ );
  } 
  elsif( $analysis->{type} eq 'tigr_gene_index' ){
    return &get_tgi_correspondences( @_ );
  }
  elsif( $analysis->{type} eq 'plantgdb_tug' ){
    return ();
    #return &get_tug_correspondences( @_ );
  }
  else{ 
    prlog("[*DIE] Problem with analysis $analysis->{analysis_name}: ".
          "Cannot get correspondences for analysis.type $analysis->{type} " );
    exit;
  }
}

#----------------------------------------------------------------------
#
sub get_unigene_correspondences{
  my( $analysis, $stream, $clust, $marker ) = @_;

  my $class = "Bio::Cluster::UniGeneI";
  $clust->isa($class) || die( "Entry must be a $class, not $clust" );

  my @correspondences;
  foreach my $seq( @{$clust->sequences} ){

    my $marker_type_id = $MARKER_TYPES{lc($seq->{seqtype})};
    
    # Find IDs of correspondence markers
    unless( $marker_type_id ){
      prlog( "[WARN][CORR] Marker type $seq->{seqtype} not found" );
      next;
    }

    my @corr_markers = $MDB->marker_synonym_search
        ( synonyms => [$seq->{acc}],
          marker_type_id => $marker_type_id );

    if( ! @corr_markers ){
      prlog( "[WARN][CORR] Marker $seq->{acc} ($seq->{seqtype}) not found" );
    } 
    elsif ( @corr_markers > 1 ){
      prlog( "[WARN][CORR] Marker $seq->{acc} ($seq->{seqtype}) maps to ".
             scalar @corr_markers. " entries" );
    }
    foreach my $cmarker_id( @corr_markers ){
      push(@correspondences, { from_marker_id => $cmarker_id,
                               analysis_id    => $ANALYSES{est_cluster} });
    }
  }
  return @correspondences;
}

#----------------------------------------------------------------------
#
sub get_tgi_correspondences{
  my( $analysis, $stream, $entry, $marker ) = @_;

  my @correspondences;
  if( $stream->{datatype} eq 'tc_est' ){

    # Entry is a space-separated list of EST IDs, prefixed with 'GB|'
    my @cnames = map{ $_ =~ s/\w+\|//; $_ } split( /\s+/, $entry );
    shift @cnames; # Remove TC name


    # Get IDs of corresponding EST and mRNA markers
    my @cids;
    push( @cids, 
          $MDB->marker_synonym_search(synonyms=>[@cnames],
                                      marker_type_id=>$MARKER_TYPES{'est'}) );
    push( @cids, 
          $MDB->marker_synonym_search(synonyms=>[@cnames],
                                      marker_type_id=>$MARKER_TYPES{'mrna'}) );

    # Create corresondences for each found marker
    foreach my $cid( @cids ){
      push @correspondences, { from_marker_id => $cid,
                               analysis_id    => $ANALYSES{est_cluster} };
    }
  }
  return @correspondences;
}

#----------------------------------------------------------------------

sub annotation_to_hash{
  my $ac = shift;
  $ac->isa('Bio::AnnotationCollectionI') || 
      die( "Need an AnnotationCollection" );

  my %annots;
  foreach my $key ( $ac->get_all_annotation_keys() ){
    foreach my $annot( $ac->get_Annotations( $key ) ){
      if( $annot->isa('Bio::Annotation::SimpleValue') ){
        $annots{$key} = $annot->value;
      } elsif( $annot->isa('Bio::Annotation::Comment') ){
        $annots{$key} = $annot->text;
      } elsif( $annot->isa('Bio::Annotation::Reference' ) ) {
        $annots{'ref_authors'}  = $annot->authors;
        $annots{'ref_title'}    = $annot->title;
        $annots{'ref_location'} = $annot->location;
        $annots{'ref_pubmed'}   = $annot->pubmed;
        $annots{'ref_medline'}  = $annot->medline;
        if( $annots{'ref_location'} =~ /\((\d{4,4})\)/ ){
          $annots{'ref_year'} = $1;
        }
      } else {
        $annots{$key} = $annot;
      }
    }
  }
  return %annots;
}

#----------------------------------------------------------------------
#
sub seqfeature_to_hash{
  my $sf = shift;
  $sf->isa("Bio::SeqFeatureI") ||
      die( "Need a SeqFeature" );

  my %annots;
  foreach my $tag( $sf->get_all_tags ){
    $annots{$tag} = join( " || ", $sf->get_tag_values( $tag ) );
  }
  return %annots;
}

#----------------------------------------------------------------------
#
# Pre-cache a list of all marker species keyed by gramene_taxonomy_id.
our %all_marker_species;
sub markerdb_species_id{
  unless( %all_marker_species ){
    # Initialise cache
    %all_marker_species = ( map{ $_->{species} => $_ }
                            @{$MDB->get_all_species} );
  }

  my $bio_species = shift || die( "Need a Bio::Species" );
  my $species_name = $bio_species->binomial;

  if( my $sp = $all_marker_species{$species_name} ){
    # Found in cache
    return $sp->{species_id};
  } 

  # Create new
  my $species_id = '???';

  my $gramene_tax_id;
  if( my $ncbi_taxid = $bio_species->ncbi_taxid ){
    my $term = $ODB->get_term_by_xref('NCBI_taxid',$ncbi_taxid) ||
        die $ODB->error; # TODO - handle this gracefuly!
    $gramene_tax_id = $term->{term_accession};
  }

  my $common_name = $bio_species->common_name;
  if( $species_name eq $common_name ){ $common_name = undef }
  my %sp_data = ( species_id          => $species_id,
                  species             => $species_name,
                  common_name         => $common_name,
                  gramene_taxonomy_id => $gramene_tax_id );
  prlog( "[INFO][SPECIES] Creating $sp_data{species} ($gramene_tax_id)" );
  if( $I ){
    $species_id = $MDB->find_or_create_species( $species_name ) ||
        die $MDB->error;
    $sp_data{species_id} = $species_id;
    $MDB->update_species( %sp_data ) ||
        die $MDB->error;
  }
  # Update cache
  $all_marker_species{$species_name} = \%sp_data;
  
  return $species_id;
}

#----------------------------------------------------------------------
#
# Cache a list of all marker germplasms keyed by germplasm name.
our %all_marker_germs;
sub markerdb_germplasm_id{

  unless( %all_marker_germs ){
    # Initialise cache
    foreach( @{$MDB->get_all_germplasm} ){
      my $sid = $_->{species_id};
      my $gid = $_->{germplasm_id};
      my $nm  = $_->{germplasm_name};
      $all_marker_germs{$sid} ||= {};
      $all_marker_germs{$sid}->{$nm} = $gid;
    }
  }

  my $germplasm  = shift || die( "Need a germplasm name" );
  my $truncated = 0;
  if( length( $germplasm ) > 50 ){
    # Truncate name to fit database field
    $germplasm = substr($germplasm,0,47) . "...";
    $truncated ++;
  }
  my $species_id = shift || die( "Need a species_id" );
  my $germplasm_id = '???';
  if( my $id = $all_marker_germs{$species_id}->{$germplasm} ){
    # Found in cache
    $germplasm_id = $id;
  }
  else{
    # Create new
    prlog( "[INFO][GERMPLASM] Creating $germplasm (species_id $species_id)" );
    if( $truncated ){
      prlog( "[WARN][GERMPLASM] Truncated germplasm_name to $germplasm" ); 
    }
    if( $I ){
      $germplasm_id = $MDB->find_or_create_germplasm($germplasm,$species_id) ||
          die( $MDB->error );
    }
    # Update cache
    $all_marker_germs{$species_id}->{$germplasm} = $germplasm_id;
  }
  return $germplasm_id;
}

#----------------------------------------------------------------------
# Compares two species. 
# Assumes that Oryza sativa (japonica) is same as Oryza sativa
sub compare_species{
  my $sp1 = shift || die( "Need a species to compare" );
  my $sp2 = shift || die( "Need a species to compare with" );
  if( $sp1 eq $sp2 ){ return 'EXACT' };
  my( $g1, $s1 ) = split( /\s/, $sp1 );
  my( $g2, $s2 ) = split( /\s/, $sp2 );
  if( $g1 eq $g2 and $s1 eq $s2 ){ return 'NON_EXACT' }
  return undef;
}

#----------------------------------------------------------------------
#
sub log_correspondence_problem{
  my $type = shift || die( "Need a problem type" );
  my $marker1 = shift;
  my $marker2 = shift;

  $type = uc( $type );
  if( $type eq 'SPECIES' ){
    my $t = ( "[WARN][CORR] X-species shared syn: ".
              "Marker %s (%s, %s) vs. ".
              "Marker %s (%s, %s)" );
    prlog( sprintf( $t,
                   $marker1->{marker_name},
                   $marker1->{marker_type},
                   $marker1->{species},
                   $marker2->{marker_name},
                   $marker2->{marker_type},
                   $marker2->{species}, ) );
  } else {
    die( "Unknown correspondence problem, $type" );
  }
}

#----------------------------------------------------------------------
#
sub prlog{
  my $message = shift;
  print LOG $message."\n";
}

#----------------------------------------------------------------------
#
sub print_report{

  # Look for unhandled attributes
  foreach my $attrib( sort keys %all_attribs ){
    my $m = "[WARN] Unhandled attrib: $attrib. Length: $all_attribs{$attrib}";
    prlog($m); print($m."\n");
  }

  # Print status info
  my $msg = sprintf
      ( 
        "[INFO] Done. Processed %s record%s of %s with %s error%s.",
        $PROCESSED_COUNT, ( $PROCESSED_COUNT == 1 ) ? '' : 's',
        $COUNT          , 
        $ERROR_COUNT    , ( $ERROR_COUNT     == 1 ) ? '' : 's',
        ); 
  prlog($msg);
  print("$msg\n");

};


#======================================================================


