#!/usr/local/bin/perl

=head1 NAME

load-markers-genbank.pl - load marker data into db direct from GenBank

=head1 SYNOPSIS

  load-markers-genbank.pl [options] <fasta_file> [<fasta_file> ...]

Options:

  -h|--help         Show brief help and exit.
  -v|--verbose      Talk about what's happening.
  -c|--config_file  Path to DB config file, def $ENV{GrameneConfPath}.
  -a|--analysis     analysis_name in MarkersDB analysis table.
  -m|--marker_type  Marker type (e.g. EST).
  -n|--no_update    Do not update the marker if it is found by name, type, species (analysis).
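  -s|--species      Species name for the markers. Def looked up from the
                    analysis name prefix (the text before the first '_').
  --datafile        Path to source GenBank file (if debugging this way).
                    Currently disabled: the option is commented out in the code.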
  --logdir          Directory to write logs into. Def $PWD
  --logfile         File to write logs into. Def date.pid.script.analysis.log

=head1 DESCRIPTION

Loads data directly into the markers DB from GenBank using an Entrez
query. Under development.

The program retrieves the requested 'analysis' from the MarkersDB
(connection defined by config_file) and uses the query defined therein
to pull entries from GenBank and load them into the MarkersDB.

Format of config_file;

  <markers_admin>
      db_dsn   dbi:mysql:{dbname}:{myhost}:{myport}
      db_user  marker_rw_user
      db_pass  secret
  </markers_admin>

If the program is called with e.g. '-analysis dbest_poaceae', then, so
long as the type is 'entrez', GenBank will be queried with the text in
the description, limited between the last_run date and today's
date. This allows for incremental updates. The last_run date will be
updated upon completion. If last_run is NULL, then no limits will be
applied.

An example MarkersDB.analysis entry;

  analysis_id: 6
analysis_name: dbest_poaceae
         type: entrez
  description: Poaceae[ORGN] AND gbdiv_est[PROP]
     last_run: 2005-07-26

The program iterates through each sequence returned by the entrez
query and uses the data to;

  - Construct a variable representing a MarkersDB-compliant marker,
  - Update the MarkersDB if the marker already exists,
  - Insert the Marker into the DB if it is 'new',
  - Create a correspondence between the marker and markers of the 
    same species that share a synonym.



Everything printed to STDERR by the program will be logged to a
file. Lines start with '[INFO]', '[WARN]' or '[*DIE]' for ease of
grepping. The logfile and logdir can be specified as program
arguments. You can also specify '--logfile=STDOUT' to get the log to
print to the terminal (useful for debug).

If it all goes wrong, here is some SQL that blows away all traces of
an analysis and its markers. Run the statements in order because of
key constraints;

  #Clean up correspondences

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.from_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="dbest_poaceae_2003";
 
  delete ac
  from  marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and   m.marker_id=ac.from_marker_id
  and   a.analysis_name="dbest_poaceae_2003";

  delete ca
  from  marker m, analysis a, analytical_correspondence ac,
        correspondence_to_analysis ca
  where m.analysis_id=a.analysis_id
  and   m.marker_id=ac.to_marker_id
  and   ac.analytical_correspondence_id=ca.analytical_correspondence_id
  and   a.analysis_name="dbest_poaceae_2003";
 
  delete ac
  from marker m, analysis a, analytical_correspondence ac
  where m.analysis_id=a.analysis_id 
  and m.marker_id=ac.to_marker_id
  and a.analysis_name="dbest_poaceae_2003";

  # Clean up synonyms
  
  update marker m, analysis a 
  set    m.display_synonym_id=NULL 
  where  m.analysis_id=a.analysis_id 
  and    a.analysis_name="dbest_poaceae_2003";

  delete ms 
  from  marker m, marker_synonym ms, analysis a 
  where m.marker_id=ms.marker_id 
  and   m.analysis_id=a.analysis_id 
  and   a.analysis_name="dbest_poaceae_2003";

  # And the marker

  delete md
  from marker m, analysis a, marker_details_est md
  where m.analysis_id=a.analysis_id
  and   m.marker_id=md.marker_id 
  and   a.analysis_name="dbest_poaceae_2003";

  delete m 
  from  marker m, analysis a 
  where m.analysis_id=a.analysis_id 
  and   a.analysis_name="dbest_poaceae_2003";

  # Reset the analysis to re-run

  update analysis 
  set last_run=NULL 
  where analysis_name="dbest_poaceae_2003";


=head1 SEE ALSO

Gramene::Marker::DB, Text::RecordParser.

=head1 AUTHOR

Will Spooner E<lt>whs@ebi.ac.ukE<gt>.

=cut

# ----------------------------------------------------

use strict;
use Getopt::Long;

use Gramene::Ontology::OntologyDB;
use Pod::Usage;
use Text::RecordParser;

use Bio::DB::Query::GenBank;
use Bio::DB::GenBank;
use Data::Dumper qw(Dumper);
use Date::Calc;
use Carp;

use lib "/home/weix/gramene/lib/perl";
use Gramene::Marker::DB;

our ( $no_processed, $count, $no_errors ) = ( 0, '???', 0 );

my %species_name = (
		    Barley => 'Hordeum vulgare',
		    Maize  => 'Zea mays',
		    Millet => 'Pennisetum glaucum',
		    Rice   => 'Oryza sativa',
		    RiceAlta => 'Oryza sp.',
		    RiceAustraliensis => 'Oryza sp.',
		    RiceBrachyantha   => 'Oryza sp.',
		    RiceCoarctata     => 'Oryza coarctata',
		    RiceGlaberrima    => 'Oryza sp.',
		    RiceJaponica      => 'Oryza sp.',
		    RiceMinuta        => 'Oryza sp.',
		    RiceNivara        => 'Oryza sp.',
		    RicePunctata      => 'Oryza sp.',
		    RiceRufipogon     => 'Oryza sp.',
		    Ryegrass          => 'Secale cereale',
		    Sorghum           => 'Sorghum bicolor',
		    Sugarcane         => 'Saccharum',
		    Wheat             => 'Triticum aestivum',
		    nonRice           => 'UNKNOWN',
		   );

#END{ &print_report; exit(0) }
$SIG{'INT'} = sub { exit(0) };

my ( $help, $v, $conffile,  $analysis_name_f, $marker_type,
     $no_update, $logdir, $logfile, $species_f );
GetOptions(
           'help'           => \$help,
           'config_file:s'  => \$conffile,
           #'datafile:s'     => \$gbfile,
           'verbose'        => \$v,
           'analysis:s'     => \$analysis_name_f,
           'marker_type:s'  => \$marker_type,
           'no_update'      => \$no_update,
           'logdir:s'       => \$logdir,
           'logfile:s'      => \$logfile,
	   'species:s'      => \$species_f,
);
pod2usage(-verbose => 2) if $help;

#----
# Validate params
if( defined $conffile ){ $ENV{GrameneConfPath} = $conffile }

unless( $marker_type ){
  warn( "\n[*DIE] Must specify a --marker_type, e.g. est\n\n" );
  pod2usage;
}


if( $no_update ){ 
  warn( "[INFO] Evaluation run - no db update will be made if the marker is found by name, species, type (analysis)\n");
}

foreach my $file( $ENV{GrameneConfPath}, @ARGV ){
  unless( $file ){
    warn( "\n[*DIE] Unable to find config file\n\n" );
    pod2usage;
  } unless( -e $file ){
    warn( "\n[*DIE] File $file does not exist\n\n" );
    pod2usage;
  } unless( -r $file ){
    warn( "\n[*DIE] Cannot read $file\n\n" );
    pod2usage;
  } unless( -f $file ){
    warn( "\n[*DIE] File $file is not plain-text\n\n" );
    pod2usage;
  } unless( -s $file ){
    warn( "\n[*DIE] File $file is empty\n\n" );
    pod2usage;
  }
}


# Log stderr to logfile
$logdir ||= $ENV{PWD};
unless( $logfile ){
  my( $file ) = ( $0 =~ m/([^\/]+)$/ );
  $file =~ s/\.\w+$//;
  my $date = sprintf('%4.4i%2.2i%2.2i',Date::Calc::Today);
  $logfile = join( ".", $date, $$, $file, ($analysis_name_f||()),'log' );
}
if( uc($logfile) eq 'STDERR' or uc($logfile) eq 'STDOUT' ){
  # Log to STDOUT
  open( LOG, ">&$logfile" ) or die( $! );
} else {
  # Log to file
  open( LOG, ">$logfile" ) or die( $! );
  open( STDERR, ">&LOG" ) or die( $! );
}


my $today = sprintf( "%4.4i-%2.2i-%2.2i", Date::Calc::Today );
#-----
# Connect to Marker and Ontology DB
our( $MDB);
$ENV{GATEWAY_INTERFACE} ++; # tell DBD::mysql to auto-reconnect

$MDB = Gramene::Marker::DB->new( admin=>1 ) ||
  die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";


foreach my $file(@ARGV ){

  my $analysis_name = $analysis_name_f;
  unless( $analysis_name ){
    $file =~ m= ([^/]+) \. fa \z =xms;
    $analysis_name = $1;
  }

  my $species = $species_f;
  unless( $species){
    my @t = split "_", $analysis_name;
    $species = $species_name{$t[0]};
    
  }

  prlog( "$file, $analysis_name, $species\n");
#next;
  
  unless ($analysis_name && $species){
   
    print STDERR "Missing analysis_name($analysis_name) or species($species)\n";
    next; 
  }
  #----

#----
# Get analysis data from DB
  my ($analysis);

  if( $analysis_name ){
  
    $analysis = $MDB->find_or_create_Analysis( { analysis_name => $analysis_name } ) ||
      die "\n[*DIE] " . $MDB->error . "\n\n";
    
  }

  
  #-----
  # Get the detail table for this type
  # print "The marker types are\n" . (join "\n", @marker_type_names) ."\n";
  
  my $marker_types      = $MDB->get_all_marker_types( ); #order_by =>"marker_type is default
  my @marker_type_names = reverse sort map { lc($_->{marker_type}) } @{$marker_types};
  my $marker_type_re =  join "|", @marker_type_names;
  
  #print "[DG] The marker type regex is $marker_type_re\n";
  
  if( lc($marker_type) =~ / ($marker_type_re) /xms){
    #print "matched $1\n";
  }else{
    my $msg = "[*DIE] --marker_type $marker_type unknown. Use est, mrna or bacend";
    prlog( $msg );
    #print( "\n$msg\n\n" );
    exit;
  }
  
  my ($table, @fields) = $MDB->marker_type_to_table_name($marker_type);
  print "[DG] The detail table to use is $table\n";
  
  #-----
  # Get the SeqIO stream
  
  my( $seqio) = &get_seqio( $analysis, $file );
  
  #print "[DG] After get Seqio for $analysis->{analysis_name}, $file\n";


  #---
  # Loop through each seq
  my $batch_size = 1000; # For progress output
  #my  $c = 0;
 ENTRY: while( my $seq =  $seqio->next_seq ){
    my $batch = $no_processed/$batch_size;
    if( $batch - int($batch) eq 0 ){
      print( "...processed $no_processed of $count...\n" );
    }
    $no_processed ++;
    
    #warn Dumper( $seq );
    
    # Initialise the marker annotation variables, and populate the easy ones
    
    
    my %marker  = ( marker_name     => $seq->display_id,
		    description     => $seq->desc,
		    marker_type     => $marker_type,
		    species         => $species,
		    analysis_id     => $analysis->analysis_id,
		    details         => { date_updated => $today,
					 seq     => $seq->seq, }
		  );
    
    #my $marker_id = $MDB->create_marker(%marker) ||
     # die( $MDB->error );
  
    my $marker_id;

    if($no_update){
      
      $marker_id = $MDB->find_or_create_marker(%marker ) ||
	die( $MDB->error );
      
    }else{
      
      $marker_id = $MDB->update_or_create_marker(%marker ) ||
	die( $MDB->error );
    }
    prlog("[INFO] Creating marker $marker_id");
            
    
  }
  
  # All done - update the last_run field of analysis
  if( $analysis  ){
    #warn Dumper( $analysis );
    $analysis->{last_run} = $today;
    $MDB->update_analysis(%$analysis) || croak( $MDB->error );

    prlog("[INFO][ANALYSIS] ".
	  "Updating $analysis->{analysis_name} last_run to ".
	  "$analysis->{last_run}");
  }
  
}

exit;

#======================================================================
#
sub get_seqio{
  my $analysis = shift;
  my $gb_file  = shift;

    
  my $seqio = Bio::SeqIO->new(-file=>$gb_file, -format=>'Fasta');
  print "gb_file = $gb_file\n";
    
  return( $seqio );
}

sub prlog{
  my $message = shift;
  print LOG $message."\n";
}

#----

__END__

