#!/usr/local/bin/perl

=head1 NAME

load-mappings-from-ensembl.pl - load marker mappings into the markers db from an Ensembl database

=head1 SYNOPSIS

  load-mappings-from-ensembl.pl [options]

Options:

  -h|--help         Show brief help and exit.
  -i|--info         Show extended help info.
  -v|--verbose      Talk about what's happening.
  -c|--config_file  Path to DB config file, def $ENV{GrameneConfPath}.
  -e|--ens_registry Path to ensembl registry file.
  -s|--species      Species to use from ensembl registry.
  -l|--logic_name   Logic name of the ensembl analysis to load.
  -m|--map_set      The MarkersDB map set that markers align to.
  -n|--no_insert    Do not make changes to the database. Useful for debug.
  --logdir          Directory to write logs into. Def $ENV{PWD}.
  --logfile         File to write logs into, or STDERR/STDOUT.
                    Def date.pid.script.logic_name.log

=head1 DESCRIPTION

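Loads marker-to-genome mappings from an Ensembl core database into the
MarkersDB.

For the Ensembl analysis selected by its logic name, each distinct
hit_name in the dna_align_feature table is looked up as a marker of type
'est' in the MarkersDB. The alignments of that hit_name are then stored
as marker mappings against the map, within the chosen map set, whose
name (ignoring a leading 'chr' prefix) equals the Ensembl seq_region
name. The corresponding MarkersDB analysis record is updated and, on
completion, its last_run date is set to today.

Features with no corresponding marker, and alignments on sequence regions
with no corresponding map, are reported in the log file. With
--no_insert no mappings are stored and the analysis record is not updated.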

=head1 AUTHOR

Will Spooner E<lt>whs@ebi.ac.ukE<gt>.

=cut


use strict;
use Data::Dumper qw(Dumper);
use File::Spec;
use Getopt::Long;
use Pod::Usage;
use Date::Calc;

# The following paths to the ensembl API may need to be changed!
# They are added to @INC before the modules below are loaded, because
# 'use' statements are processed in order at compile time.
use lib qw(/usr/local/gramene_ensembl/ensembl-live/ensembl/modules);
use lib qw(/usr/local/gramene_ensembl/ensembl-live/ensembl-compara/modules);

use Gramene::Marker::DB;
use Bio::EnsEMBL::Registry;

use vars qw( $I $V $EDB $MDB $LOGIC_NAME $MAP_SET_ID );
BEGIN{  #Argument Processing
  # Everything MAIN needs is set up here, at compile time:
  #   * command-line options are parsed and validated,
  #   * the MarkersDB ($MDB) and Ensembl core ($EDB) adaptors are created,
  #   * --map_set is resolved to exactly one $MAP_SET_ID,
  #   * the LOG filehandle used by prlog() is opened.
  # Any validation failure prints a message and exits.
  my $help=0;
  my $man=0;
  my( $species, $conffile, $ensreg, $map_set );
  my( $no_insert, $logdir, $logfile );
  GetOptions
      (
       "help|?"          => \$help,
       "info"            => \$man,
       "verbose"         => \$V,
       "species=s"       => \$species,
       "config_file:s"   => \$conffile,
       # 'ensembl_reg' and 'analysis' are accepted as alternative
       # spellings of the registry and logic-name options.
       "ens_registry|ensembl_reg=s" => \$ensreg,
       "l|logic_name|analysis=s"    => \$LOGIC_NAME,
       "map_set=s"       => \$map_set,
       'no_insert'       => \$no_insert,
       'logdir:s'        => \$logdir,
       'logfile:s'       => \$logfile,
       ) or pod2usage(2);
  pod2usage(-verbose => 2) if $man;
  pod2usage(1) if $help;

  # Do we update DB?
  $I = $no_insert ? 0 : 1;

  # Validate conf files
  if( defined $conffile ){ $ENV{GrameneConfPath} = $conffile }
  $ensreg               or warn("[*DIE] Need an ens_registry") && pod2usage();
  $ENV{GrameneConfPath} or warn("[*DIE] Need a config_file" )  && pod2usage();

  foreach my $file( $ENV{GrameneConfPath},$ensreg ){
    -e $file or warn("\n[*DIE] File $file does not exist\n\n") && pod2usage();
    -r $file or warn("\n[*DIE] Cannot read $file\n\n")         && pod2usage();
    -f $file or warn("\n[*DIE] File $file is not text\n\n")    && pod2usage();
    -s $file or warn("\n[*DIE] File $file is empty\n\n")       && pod2usage();
  }

  # Create Database adaptors
  $ENV{GATEWAY_INTERFACE} ++; # tell DBD::mysql to auto-reconnect
  my $admin = $I ? 1 : 0;
  $MDB = Gramene::Marker::DB->new( admin=>$admin ) ||
      die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";
  $MDB->db->{RaiseError} = 0; # Try to avoid Lost connection to MySQL errs

  $species || ( warn( "Need a --species\n" ) && pod2usage(1) );
  Bio::EnsEMBL::Registry->load_all( $ensreg );
  $EDB = Bio::EnsEMBL::Registry->get_DBAdaptor( $species, 'core' );
  $EDB or warn("No ens core DB for $species set in $ensreg\n" ) && pod2usage();

  # Other args
  $LOGIC_NAME or warn("Need a --logic_name\n") && pod2usage();
  $map_set or warn("Need a --map_set\n") && pod2usage();

  # Resolve --map_set to exactly one MarkersDB map set. The value is
  # applied as a case-insensitive regexp against each map_set_name.
  my @all_map_sets = @{$MDB->get_all_map_sets||[]};
  my $map_set_name;
  foreach my $set( @all_map_sets ){
    next unless $set->{map_set_name} =~ /$map_set/i;
    if( $MAP_SET_ID ){
      # A second hit: the pattern is ambiguous, so refuse to guess.
      warn( "[*DIE] map_set $map_set maps to >1 DB entries\n" );
      warn( "[*DIE]     $map_set_name and $set->{map_set_name}\n" );
      exit 1;
    }
    $map_set_name = $set->{map_set_name};
    $MAP_SET_ID = $set->{map_set_id};
  }
  if( $MAP_SET_ID ){
    warn( "[INFO] using map_set $map_set_name [ID: $MAP_SET_ID]\n" );
  } else {
    # Without a map set no mapping can be placed on a map, so stop here
    # rather than carrying on with an undefined $MAP_SET_ID.
    warn( "[*DIE] Could not find map_set for $map_set. Use one of:\n" );
    foreach my $set( @all_map_sets ){
      warn( "[*DIE]    $set->{map_set_name}\n" );
    }
    exit 1;
  }

  #----
  # Open the LOG filehandle
  $logdir ||= $ENV{PWD} || File::Spec->curdir;
  unless( $logfile ){
    my( $file ) = ( $0 =~ m/([^\/]+)$/ );
    $file =~ s/\.\w+$//;
    my $date = sprintf('%4.4i%2.2i%2.2i',Date::Calc::Today());
    $logfile = join( ".", $date, $$, $file, ($LOGIC_NAME||()),'log' );
  }
  my $log_target = uc($logfile);
  if( $log_target eq 'STDERR' or $log_target eq 'STDOUT' ){
    # Log to one of the standard streams (name matched case-insensitively)
    open( LOG, '>&', ( $log_target eq 'STDERR' ? \*STDERR : \*STDOUT ) )
        or die( "[*DIE] Cannot log to $log_target: $!\n" );
  } else {
    # Log to file; a relative --logfile is placed inside --logdir
    my $logpath = File::Spec->file_name_is_absolute( $logfile )
        ? $logfile
        : File::Spec->catfile( $logdir, $logfile );
    open( LOG, '>', $logpath )
        or die( "[*DIE] Cannot write log file $logpath: $!\n" );
    #open( STDERR, ">&LOG" ) or die( $! );
  }
}

MAIN:{
  # Driver. For the analysis named by $LOGIC_NAME:
  #   1. check the analysis exists in the Ensembl DB and report its size,
  #   2. find/create (and, unless --no_insert, update) the MarkersDB
  #      analysis record,
  #   3. for every distinct hit_name, look up the matching 'est' markers
  #      and store one mapping per (alignment, marker) pair,
  #   4. stamp the analysis with today's date as last_run.

  # How many features in ens for the analysis?
  my $fcount = feature_count();
  unless( defined $fcount ){
    my @all_lnames = all_logic_names();
    warn( "[*DIE] logic_name $LOGIC_NAME not found. Select from:\n" );
    warn( "[INFO] $_\n" ) foreach @all_lnames;
    pod2usage();
  }
  warn( "[INFO] features corresponding to $LOGIC_NAME: $fcount" );
  my $fncount = feature_name_count();

  warn( "[INFO] feature names corresponding to $LOGIC_NAME: $fncount" );

  # Get the MarkersDB analysis_id
  # NOTE(review): this is called even with --no_insert; if it really
  # creates a missing record, that is a DB write in "no changes" mode.
  # Confirm against Gramene::Marker::DB.
  my $analysis_id = $MDB->find_or_create_analysis($LOGIC_NAME) ||
      die "\n[*DIE] " . $MDB->error . "\n\n";

  # Attributes shared by both update_analysis calls below
  my @analysis_attrs =
      ( analysis_id   => $analysis_id,
        analysis_name => $LOGIC_NAME,
        type          => 'ensembl_mapping',
        description   => ( 'dna_align_feature mapping '.
                           'from EnsemblDB corresponding to the '.
                           "$LOGIC_NAME analysis.logic_name" ) );
  if( $I ){
    $MDB->update_analysis( @analysis_attrs ) ||
        die "\n[*DIE] " . $MDB->error . "\n\n";
  }

  #---
  # Loop through each feature_name in dna_align_feature
  my $batch_size = 1000; # For progress output
  my $no_processed = 0;
  my $sth = feature_name_sth();
  # List assignment keeps the loop going until the cursor is exhausted,
  # instead of stopping at the first false hit_name.
  FEATURE: while( my( $name ) = $sth->fetchrow_array ){
    if( $no_processed % $batch_size == 0 ){
      print( "...processed $no_processed of $fncount...\n" );
    }
    $no_processed ++;

    unless( $name ){
      # feature_mappings() needs a true name, so skip rather than abort
      prlog( "[WARN][FEATURE] Skipping feature with empty/false hit_name" );
      next FEATURE;
    }

    #my @markers = $MDB->marker_synonym_search(synonyms=>[$name]);
    my @markers = $MDB->marker_search(marker_name=>$name,type=>'est');

    my $num_markers = scalar( @markers );
    if( $num_markers == 0 ){
      prlog( "[WARN][MARKER] No marker corresponds to feature $name" );
      next FEATURE;
    }
    if( $num_markers > 1 ){
      prlog( "[WARN][MARKER] $num_markers markers correspond to feature $name" );
    }

    my @mappings = feature_mappings( $name );
    my $num_mappings = 0; # Mappings that could be placed on a map

    # Loop for each mapping
    MAPPING: foreach my $mapping( @mappings ){
      my $map_id = seq_region_to_map_id($mapping->{seq_region_name});
      unless( $map_id ){
        # Only this alignment is unplaceable; carry on with the others
        prlog( "[WARN][MAPPING] $name: No map found for ".
               $mapping->{seq_region_name} );
        next MAPPING;
      }
      $mapping->{map_id} = $map_id;
      $mapping->{analysis_id} = $analysis_id;

      # And for each marker
      foreach my $m( @markers ){
        $mapping->{marker_id} = $m->{marker_id};
        $mapping->{marker_name} = $m->{marker_name};
        if( $I ){
          $MDB->set_marker_mapping(%$mapping) ||
              ( die( "[*DIE] ".$MDB->error ) );
        }
      }
      $num_mappings ++;
    }
    prlog( "[INFO][MAPPING] $name: ".
           "Created $num_mappings mappings for $num_markers markers" );
  }

  if( $I ){
    my $last_run = sprintf( "%4.4i-%2.2i-%2.2i", Date::Calc::Today() );
    $MDB->update_analysis( @analysis_attrs, last_run => $last_run ) ||
        die "\n[*DIE] " . $MDB->error . "\n\n";
    prlog("[INFO][ANALYSIS] ".
          "Updating $LOGIC_NAME.last_run to ".
          "$last_run");
  }

  prlog( "[INFO] Processed $no_processed features of $fncount" );

  # Buffered write errors only surface when the handle is closed
  close( LOG ) or warn( "[WARN] Could not close log: $!\n" );
}
exit;

#======================================================================
# Counts the number of dna_align_features in $EDB corresponding to $LOGIC_NAME
sub feature_count{
  # Returns the number of dna_align_feature rows in $EDB that belong to
  # the analysis whose logic_name is $LOGIC_NAME. Returns nothing (undef
  # in scalar context) when no such analysis exists; dies with the DBI
  # error string if the query cannot be executed.
  my $analysis =
      $EDB->get_adaptor('Analysis')->fetch_by_logic_name($LOGIC_NAME);
  return unless $analysis;

  my $sql = qq(
SELECT count(*)
FROM   dna_align_feature
WHERE  analysis_id=?);
  my $query = $EDB->dbc->prepare($sql);
  $query->execute( $analysis->dbID ) or die( $query->errstr );

  # Single row, single column: the count
  return ( $query->fetchrow_array )[0];
}

#----------------------------------------------------------------------
# Counts the number of distinct feature names corresponding to $LOGIC_NAME
sub feature_name_count{
  # Returns the number of distinct hit_name values among the
  # dna_align_feature rows of the $LOGIC_NAME analysis. Returns nothing
  # (undef in scalar context) when the analysis is unknown; dies with the
  # DBI error string if the query cannot be executed.
  my $analysis =
      $EDB->get_adaptor('Analysis')->fetch_by_logic_name($LOGIC_NAME);
  return unless $analysis;

  my $sql = qq(
SELECT COUNT( DISTINCT(hit_name) ) 
FROM   dna_align_feature 
WHERE  analysis_id=?);
  my $query = $EDB->dbc->prepare($sql);
  $query->execute( $analysis->dbID ) or die( $query->errstr );

  # Single row, single column: the count
  return ( $query->fetchrow_array )[0];
}

#----------------------------------------------------------------------
# Returns a DBI sth containing the feature name query
sub feature_name_sth{
  # Prepares and executes the query listing every distinct hit_name of
  # the $LOGIC_NAME analysis, and returns the executed statement handle
  # so the caller can fetch the names row by row.
  # Returns nothing (undef in scalar context) if the analysis is unknown;
  # dies with the DBI error string if execute fails.
  my $analysis_adaptor = $EDB->get_adaptor('Analysis');
  my $analysis = $analysis_adaptor->fetch_by_logic_name($LOGIC_NAME) ||
      return;
  my $q = qq(
SELECT DISTINCT(hit_name)
FROM   dna_align_feature
WHERE  analysis_id=? );
  my $sth = $EDB->dbc->prepare($q);
  # DBD::mysql attribute: fetch rows from the server as they are read
  # instead of buffering the whole result set on the client.
  $sth->{"mysql_use_result"} = 1;
  # The statement order here matters: the handle is prepared first, then
  # the connection object is made to reconnect, then the handle is
  # executed. NOTE(review): presumably this leaves $sth on the old
  # database handle so that later queries through $EDB->dbc do not
  # collide with the still-open streaming result - confirm before
  # reordering.
  delete( $EDB->dbc->{'connected'.$$} ); # Needed for mysql_use_result
  $EDB->dbc->connect;                    # Ditto
  my $rv = $sth->execute($analysis->dbID) || die( $sth->errstr);
  return $sth;
}

#----------------------------------------------------------------------
# Converts the ensembl seq_region_name to a MarkersDB map_id
# Cache of map name (leading 'chr' prefix removed) => MarkersDB map_id,
# plus a flag recording that it has been filled. Neither has an
# initialiser: the script exits before these file-level statements are
# reached at run time, so only the declarations take effect.
my %name_to_id;
my $name_to_id_loaded;
sub seq_region_to_map_id{
  # Converts an Ensembl seq_region name into the map_id of the map in
  # map set $MAP_SET_ID that has the same name once a leading 'chr'
  # (plus any following non-digits) is stripped from the map name.
  #   Arg:     seq_region name (required; dies if missing or empty)
  #   Returns: the map_id, or undef if no map has that name
  # NOTE(review): map names with no digit after 'chr' (e.g. 'chrX') all
  # reduce to the empty string and so can never be matched - confirm
  # whether such maps occur.
  my $name = shift;
  ( defined $name and length $name ) or die( "Need a seq_region_name" );

  unless( $name_to_id_loaded ){
    # Fill the cache once only, even if the map set has no maps
    my @maps = @{$MDB->get_all_maps(map_set_id=>$MAP_SET_ID) || []};
    foreach my $map( @maps ){
      my $map_name = $map->{map_name};
      $map_name =~ s/^chr\D*//i;
      $name_to_id{$map_name} = $map->{map_id};
    }
    $name_to_id_loaded = 1;
  }
  return $name_to_id{$name};
}

#----------------------------------------------------------------------
# Returns the mappings from the dna_align_feature table between the 
# feature and the genome. 
# Cache of logic_name => Ensembl analysis dbID, so the analysis is looked
# up once rather than once per feature. Declared without an initialiser
# because the script exits before this statement is reached at run time.
my %analysis_dbid_for;
sub feature_mappings{
  # Returns the genome alignments of one feature, restricted to the
  # $LOGIC_NAME analysis so that alignments of the same hit_name made by
  # other analyses are not loaded under this analysis.
  #   Arg:     feature name (dna_align_feature.hit_name); required
  #   Returns: list of hashrefs, one per alignment, keyed by the column
  #            aliases in the SELECT below
  #   Dies:    if the name is missing, the analysis is unknown, or the
  #            query fails
  my $fname = shift;
  ( defined $fname and length $fname ) or die( "Need a feature name" );

  unless( exists $analysis_dbid_for{$LOGIC_NAME} ){
    my $analysis =
        $EDB->get_adaptor('Analysis')->fetch_by_logic_name($LOGIC_NAME)
        or die( "No ensembl analysis with logic_name $LOGIC_NAME" );
    $analysis_dbid_for{$LOGIC_NAME} = $analysis->dbID;
  }

  my $q = qq(
SELECT cs.name as coord_system_name,
       sr.name as seq_region_name,
       seq_region_start as start,
       seq_region_end as end,
       seq_region_strand as strand,
       hit_name,
       hit_start as marker_start,
       hit_end as marker_end,
       hit_strand as marker_strand,
       score,
       evalue,
       perc_ident as percent_identity,
       cigar_line
FROM   dna_align_feature f,
       seq_region sr,
       coord_system cs
WHERE  sr.seq_region_id   = f.seq_region_id
AND    sr.coord_system_id = cs.coord_system_id
AND    f.hit_name         = ?
AND    f.analysis_id      = ? );

  my $sth = $EDB->dbc->prepare($q);
  $sth->execute( $fname, $analysis_dbid_for{$LOGIC_NAME} )
      or die( $sth->errstr );
  return @{$sth->fetchall_arrayref({})};
}


#----------------------------------------------------------------------
# Returns all logic_names corresponding to DnaAlignFeature
sub all_logic_names{
  # Lists the logic_name of every analysis that the Ensembl analysis
  # adaptor reports for the 'DnaAlignFeature' feature class.
  my $analyses = $EDB->get_adaptor('Analysis')
                     ->fetch_all_by_feature_class('DnaAlignFeature');
  my @logic_names;
  foreach my $analysis( @{$analyses} ){
    push( @logic_names, $analysis->logic_name );
  }
  return @logic_names;
}

#----------------------------------------------------------------------
# Prints to log file
sub prlog{
  # Writes one message, followed by a newline, to the LOG filehandle.
  # The result of print is returned, so callers can test for success.
  my( $message ) = @_;
  return print LOG "$message\n";
}

#----------------------------------------------------------------------
# Retrieves all logic names from the end DB matching a regexp string
# (unused)
sub matching_logic_names{
  # Returns the logic_name of every analysis in the Ensembl DB whose
  # logic_name matches the given regexp string, case-insensitively.
  # Dies if no regexp string is supplied.
  my $regexp = shift || die( "Need a regexp string" );

  my $all_analyses = $EDB->get_adaptor('Analysis')->fetch_all;
  my @matching = grep{ $_->logic_name =~ /$regexp/i } @{$all_analyses};
  return map{ $_->logic_name } @matching;
}
