#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Analysis::Tools::TranslateTools - compute an ORF-based translation for a transcript

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 CONTACT

ensembl-dev@ebi.ac.uk

=head1 APPENDIX

The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _

=cut


# Let the code begin...

package Bio::EnsEMBL::Analysis::Tools::TranslateTools;

use vars qw(@ISA);
use strict;

use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Seg;
use Bio::EnsEMBL::DnaPepAlignFeature;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils;
use Bio::EnsEMBL::Utils::PolyA;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::DBSQL::SliceAdaptor;
use Data::Dumper;

@ISA = qw(Bio::EnsEMBL::Root);

 



# return_translation
#
#  Arg [1]   : transcript object, handed straight to compute_translation
#  Function  : convenience wrapper - runs compute_translation on the
#              transcript and hands back whatever its translation()
#              accessor returns afterwards
#  Returns   : the result of calling translation() on the object returned
#              by compute_translation
sub return_translation{
  my ($self, $transcript) = @_;

  my $with_translation = $self->compute_translation($transcript);
  return $with_translation->translation;
}

############################################################

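=head2 compute_translation

Method to calculate the translation for a given transcript
in slice coordinates. The translation is attached to the transcript
as a translation object.

The method normally takes the longest translation starting with Methionine.
If there is none, it takes the longest translation. When the longest
Met-initiated translation is short and a clearly longer translation without
a starting Methionine begins within the first three bases of the transcript,
that longer translation is used instead. It is simple
but it seems to give similar results to genomewise (regarding translation length).

=cut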

# compute_translation
#
#  Arg [1]   : $trans - transcript object; this method calls seq, biotype,
#              get_all_Exons, flush_Exons, add_Exon and translation on it
#  Arg [2]   : $pos1 (optional) - passed through unchanged to run_translate
#  Arg [3]   : $pos2 (optional) - passed through unchanged to run_translate
#  Function  : Asks run_translate for ORFs with and without the Met
#              requirement, picks one, converts its cDNA coordinates into
#              exon-relative coordinates, attaches the result to $trans as a
#              Bio::EnsEMBL::Translation and sets phase/end_phase on every
#              exon of the transcript.
#  Returns   : the $trans object that was passed in. It is returned without
#              a new translation when no ORF was predicted or when the
#              chosen ORF could not be located on the exons.
sub compute_translation{
  my ($self,$trans,$pos1,$pos2) = @_;

  my $verbose = 0;
  # Each prediction is an array ref [length, start, end]; both lists come
  # back from run_translate sorted by length, longest first.
  my @met_predictions   = $self->run_translate( $trans,1,$pos1, $pos2);
  my @nomet_predictions = $self->run_translate( $trans,0,$pos1, $pos2);

  my $length = $trans->seq->length;
  my $best;
  if ( @met_predictions && @nomet_predictions ){
    my $met_best   = $met_predictions[0];
    my $nomet_best = $nomet_predictions[0];
    my $met_len    = $met_best->[0];
    my $nomet_len  = $nomet_best->[0];

    # The shorter the best Met ORF, the smaller the length advantage the
    # non-Met ORF needs in order to win.
    my $nomet_clearly_longer =
         ( $met_len < 50  && $nomet_len > $met_len )
      || ( $met_len < 100 && $nomet_len > 1.5*$met_len )
      || ( $met_len < 200 && $nomet_len > 2*$met_len );

    # A non-Met ORF is only considered when it starts within the first
    # three bases of the sequence.
    if ( $nomet_best->[1] <= 3 && $nomet_clearly_longer ){
      $best = $nomet_best;
    }
    else{
      $best = $met_best;
    }
  }
  elsif( @met_predictions ){
    $best = $met_predictions[0];
  }
  elsif( @nomet_predictions ){
    $best = $nomet_predictions[0];
  }
  else{
    # if there are no nomet_predictions and no met_predictions return to caller
    # NOTE(review): verbose('DEPRECATE') presumably adjusts the verbosity of
    # Bio::EnsEMBL::Utils::Exception so the warning is emitted - confirm.
    verbose('DEPRECATE');
    warning("No translation could be computed for transcript \n");
    return $trans;
  }

  # For cdna biotypes whose best Met ORF starts within the first 9 bases,
  # prefer the runner-up Met ORF when it is almost as long (> 90%) and
  # starts further downstream.
  if (@met_predictions && $best == $met_predictions[0]) {
    my $second = $met_predictions[1];
    if ($trans->biotype=~/cdna/
        && $best->[1] < 10 && $second && $second->[0]> 0.9*$best->[0]
        && $second->[1]>$best->[1]){
      $best = $second;
    }
  }

  my @entry = @{$best};
  my $orf_start = $entry[1];
  my $orf_end   = $entry[2];
  print STDERR "BEST length:$entry[0] start:$entry[1] end:$entry[2]\n" if $verbose;

  my @exons = @{$trans->get_all_Exons};

  # Walk the exons in transcript order, tracking the 1-based cDNA position
  # of each exon's first base, to find which exons hold the ORF start/end
  # and the offsets of those positions inside the exon.
  my $transl_start;
  my $transl_end;
  my $transl_start_Exon;
  my $transl_end_Exon;
  my $exon_count = 0;
  print STDERR "transcript length: $length\n" if $verbose;
  my $pos = 1;
  foreach my $exon ( @exons ){
    $exon_count++;
    my $last_pos = $pos + $exon->length - 1;
    print STDERR "exon:$exon_count exon_length:".$exon->length." pos:$pos orf_start:$orf_start orf_end:$orf_end pos+:".$last_pos."\n" if $verbose;
    if ( $orf_start >= $pos && $orf_start <= $last_pos ){
      $transl_start_Exon = $exon;
      $transl_start      = $orf_start - $pos + 1;
      print STDERR "start found\n" if $verbose;
    }
    if ( $orf_end >= $pos && $orf_end <= $last_pos ){
      $transl_end_Exon   = $exon;
      $transl_end        = $orf_end - $pos + 1;
      print STDERR "end found\n" if $verbose;
    }
    $pos += $exon->length;
  }

  my $newtranslation;
  if ( $transl_start && $transl_end &&  $transl_start_Exon && $transl_end_Exon ){
    $newtranslation = Bio::EnsEMBL::Translation->new();
    $newtranslation->start( $transl_start );
    $newtranslation->end( $transl_end );
    $newtranslation->start_Exon( $transl_start_Exon );
    $newtranslation->end_Exon( $transl_end_Exon );
    $trans->translation($newtranslation);
  }
  else{
    print STDERR "problem making the translation\n";
  }

  # Remove and re-add every exon, in the same order as before.
  $trans->flush_Exons;
  foreach my $exon ( @exons ){
    $trans->add_Exon($exon);
  }

  # Without a translation there is nothing to derive phases from; bail out
  # here instead of calling methods on an undefined translation below.
  return $trans unless $newtranslation;

  my $start_exon = $newtranslation->start_Exon;
  my $end_exon   = $newtranslation->end_Exon;

  # end_phase for an exon whose last base is coding: counted from the
  # translation start in the start exon, from the exon's phase otherwise.
  my $set_end_phase = sub {
    my ($exon) = @_;
    if ($exon == $start_exon) {
      my $start_tln = $exon->start + $newtranslation->start - 1;
      $exon->end_phase(($exon->end - $start_tln + 1) % 3);
    }
    else {
      $exon->end_phase(($exon->length + $exon->phase) % 3);
    }
  };

  my $found_start = 0;
  my $found_end   = 0;
  my $last_end_phase;

  foreach my $exon (@{$trans->get_all_Exons}) {
    # Default to -1; only overwritten below for the coding parts.
    $exon->phase(-1);
    $exon->end_phase(-1);

    if ($exon == $start_exon) {
      # phase 0 only when translation begins on the exon's first base
      $exon->phase(0) if $newtranslation->start == 1;
      $found_start = 1;
    }
    elsif ($found_start and not $found_end) {
      $exon->phase($last_end_phase);
    }

    if ($exon == $end_exon) {
      # end_phase is only set when translation runs to the exon's last base
      $set_end_phase->($exon) if $newtranslation->end == $exon->length;
      $found_end = 1;
    }
    elsif ($found_start and not $found_end) {
      $set_end_phase->($exon);
    }

    $last_end_phase = $exon->end_phase;
  }

  return $trans;
}

############################################################

# run_translate
#
#  Arg [1]   : $trans - transcript object; this method calls stable_id, dbID,
#              biotype and seq on it
#  Arg [2]   : $met - when true, the external program is run with -m
#  Arg [3]   : $pos1 (optional, default 0) - when non-zero, ORFs are filtered
#              against the region $pos1..$pos2 (see the filter in the loop)
#  Arg [4]   : $pos2 (optional, defaults to $pos1)
#  Function  : Writes the transcript sequence to a FASTA file under
#              $WORK_HOME/tmp (or $HOME/tmp), runs the external "translate"
#              program on it and parses the ORF header lines of its output.
#  Returns   : list of array refs [orf_length, orf_start, orf_end], sorted by
#              orf_length, longest first
#  Throws    : dies if the FASTA file cannot be written or the translate
#              program cannot be started
sub run_translate{
    my ($self,$trans,$met, $pos1, $pos2) = @_;
    $pos1 ||= 0;
    $pos2 ||= $pos1;
    # NOTE(review): debug output is switched on here while compute_translation
    # keeps it off - confirm whether this is intentional.
    my $verbose = 1;

    # Pick the first available identifier to label the sequence with.
    my $trans_id = $trans->stable_id
                || $trans->dbID
                || $trans->biotype
                || "transcript_".$$;

    my $seq = $trans->seq;
    unless ( $seq->display_id ){
        $seq->display_id( $trans_id );
    }

    ############################################################
    # create file
    my $file = ($ENV{WORK_HOME}||$ENV{HOME})."/tmp/"."cdna_".$$.".fa";
    open( my $seq_fh, '>', $file ) or die("could not open file $file $!");
    my $seqout = Bio::SeqIO->new('-format' => 'Fasta',
                                 '-fh'     => $seq_fh);
    $seqout->write_seq($seq);
    # Buffered write errors only surface at close, so check it.
    close($seq_fh) or die("could not close file $file $!");

    # List-form pipe open: the arguments go to the program as given, so a
    # path containing spaces or shell metacharacters cannot be misparsed.
    my @command = ( 'translate', ( $met ? ('-m') : () ), $file );
    my $orf_fh;
    unless ( open( $orf_fh, '-|', @command ) ){
        unlink $file;
        die( "Error running translate" );
    }
    ############################################################
    # out put is of the form:
    #> gi|20070124|ref|NM_000918.2|.44    length 62, nt 2236..2051
    #AHDRRRSPGLREGEGPGLCRAPGLAATSSSSRHGGHPDRIRKSPFTQKCKSHDQSWRHCRRY
    #> gi|20070124|ref|NM_000918.2|.45    length 34, nt 2047..1946
    #VTMSSPAPSLPHGGQASPRRPGQGGTNTLMSKNV
    #> gi|20070124|ref|NM_000918.2|.46    length 34, nt 1942..1841
    #KSHRRNFQKEEKPPAGGRQRDSEHGSKHSGQTHV

    my @orf_predictions;
  ORF:
    while ( my $line = <$orf_fh> ){
        chomp $line;
        # only the header lines carry coordinates
        next ORF unless $line =~ /\>/;
        my @entries = split ' ', $line;
        next ORF unless ( $entries[3] && $entries[5] );
        my $id = $entries[1];
        my $orf_length = $entries[3];
        $orf_length =~ s/\,//;
        # skip headers whose coordinate field is not of the form start..end
        next ORF unless $entries[5] =~ /(\d+)\.\.(\d+)/;
        my ($orf_start, $orf_end) = ($1, $2);
        # start >= end (e.g. 2236..2051 in the sample above) is skipped
        next ORF if $orf_start >= $orf_end;
        print STDERR "id:$id\torf_length:$orf_length\tstart:$orf_start\tend:$orf_end  --1\n" if $verbose;
        # With a region given, drop ORFs that lie entirely outside it, and
        # ORFs where neither end is a multiple of 3 away from the
        # corresponding region boundary.
        next ORF if $pos1 && ( $pos1 > $orf_end
                               || $pos2 < $orf_start
                               || ( abs($pos1-$orf_start) % 3 != 0
                                    && abs($pos2-$orf_end) % 3 != 0 ) );
        print STDERR "id:$id\torf_length:$orf_length\tstart:$orf_start\tend:$orf_end  pos1:$pos1\tpos2:$pos2  --2\n" if $verbose;
        push( @orf_predictions, [ $orf_length, $orf_start, $orf_end ] );
    }
    # Closing the pipe waits for the child. Its exit status is deliberately
    # not treated as fatal, matching the previous best-effort behaviour.
    close($orf_fh);
    # The FASTA file is only needed while translate runs.
    unlink $file;

    my @sorted_predictions =
        sort { $b->[0] <=> $a->[0] } @orf_predictions;
    return @sorted_predictions;
}


############################################################

 1;
