=head1 NAME

  Gramene::Util::IndexedFasta - Build and access fasta file indexes

=head1 SYNOPSIS

  use Gramene::Util::IndexedFasta;
  
 #build an index of the fasta file
 my $if=Gramene::Util::IndexedFasta->new();
 $if->build_index_file(FASTA=>"input.fa",INDEX=>"index_name");

 #get a sequence from the fasta file:
 $if->do_sequence_search(SEQ_ID=>'Sequence1',FASTA=>'input.fa',
			 INDEX=>"index_name");

 print $if->header,"\n";
 print $if->sequence,"\n";

 #get a subsequence
 $if->do_sequence_search(SEQ_ID=>'Sequence1',FASTA=>'input.fa',
			 INDEX=>"index_name",START=>15,STOP=>30);

=head1 DESCRIPTION

  This module is used for building indexes of fasta files.
  It also allows sequence retrieval from the fasta files
  using the indexes. 

  When creating an index file, the index name may be left out.  The
  new index will then be named after the fasta file, with everything
  from its last period on replaced by '.index' (input.fa gives
  input.index).  Similarly, sequences may be retrieved without specifying
  the index name.  In that case, the program will attempt to use that
  default index, and if it does not exist, will build it.

  To retrieve a complete sequence, use:

  $if->do_sequence_search(SEQ_ID=>'Seq_Accession',FASTA=>'input.fa',
			  INDEX=>"index_name");
   
  To get a subsequence, use:

  $if->do_sequence_search(SEQ_ID=>'Seq_Accession',FASTA=>'input.fa',
			  INDEX=>"index_name",START=>start_int,
			  STOP=>stop_int);

=head1 AUTHOR

  Leonid Teytelman 

=cut

package Gramene::Util::IndexedFasta;
use strict;

sub new{	
  my $self={};
  bless $self;
  return $self;
}


=head2 do_sequence_search

 Title   : do_sequence_search
 Usage   : $if->do_sequence_search(SEQ_ID=>'Seq_Accession',FASTA=>'input.fa',
                          INDEX=>"index_name",START=>start_int,
                          STOP=>stop_int);
 Function: Extracts a sequence (or part of one) from the fasta file with
           the help of the index.  If no index name is given, the default
           index name for the fasta file is used; if the index file does
           not exist, it is built first.
           SEQ_ID matches an index entry when the two are equal or differ
           only by a trailing version number ('.digits').  The first
           matching entry wins.
           On a match the header line and the sequence are stored and can
           be read back with $if->header and $if->sequence.  If nothing
           matches, both are left undefined.
 Returns : 1 (dies if a file cannot be opened, positioned or closed)
 Args    : SEQ_ID,FASTA,INDEX (optional),START(optional),STOP (optional)
 Note    : A FASTA or INDEX name that is not an existing file is looked up
           in the warehouse; the warehouse path is used only when a file
           exists there.
           START and STOP are byte offsets into the text that follows the
           header line (line breaks included).  They are only honoured
           when both are given and may be passed in either order.


=cut

sub do_sequence_search{
  my $self=shift;
  my %args=(
            START=>undef,
            STOP=>undef,
            SEQ_ID=>'',
            FASTA=>'',
            INDEX=>'',
            @_,
           );

  my $begin=$args{START};
  my $end=$args{STOP};
  # Accept the range in either order, but only compare when both ends exist.
  ($begin,$end)=($end,$begin) if defined $begin && defined $end && $end<$begin;
  my $seq_id=defined $args{SEQ_ID} ? $args{SEQ_ID} : '';

  # Forget the result of any earlier search so that a failed lookup cannot
  # be mistaken for a hit on the previous sequence.
  delete @{$self}{qw(HEADER SEQUENCE)};

  # Fall back to the warehouse copy of a file that is not found as given.
  for my $key (qw(FASTA INDEX)) {
    next if !$args{$key} || -f $args{$key};
    my $candidate=$self->warehouse_path($args{$key});
    $args{$key}=$candidate if defined $candidate && -f $candidate;
  }

  open(my $fasta_fh,'<',$args{FASTA}) or die "Can't open the file: $args{FASTA}. $!\n";
  my $index_name=$args{INDEX} || $self->_get_default_index($args{FASTA});
  unless (-e $index_name){
    $self->build_index_file(FASTA=>$args{FASTA},INDEX=>$index_name);
  }
  open(my $index_fh,'<',$index_name) or die "Can't open the index file: $index_name. $!\n";

  # Each index line holds: id, byte offset of the header line, size of the
  # whole record (header included) and size of the header line.
  while (my $entry=<$index_fh>) {
    chomp $entry;
    my ($id,$curr_offset,$curr_size,$header_size)=split(/\t/,$entry);
    next unless defined $header_size;    # skip malformed index lines

    # Ids match when equal or when one is the other plus a version number.
    # The ids are quoted and anchored so that they are compared literally.
    next unless $id eq $seq_id
      || $id =~ /\A\Q$seq_id\E\.\d+\z/
      || $seq_id =~ /\A\Q$id\E\.\d+\z/;

    $curr_size-=$header_size;            # bytes that follow the header line

    seek($fasta_fh,$curr_offset,0) or die "did not find $id in $args{FASTA}\n";
    read($fasta_fh,my $header,$header_size);
    chomp $header;
    $self->header($header);

    my $skip=0;                          # bytes to skip after the header
    if (defined $begin && defined $end){
      # Out-of-range limits fall back to the ends of the sequence.
      $begin=0 if $begin<0 or $begin>$curr_size;
      $end=$curr_size if $end>$curr_size or $end<0;
      $skip=$begin;
      my $wanted=$end-$begin;
      $curr_size=$wanted if $wanted<$curr_size;
    }

    # The handle now sits just behind the header, so seek relative to it.
    seek($fasta_fh,$skip,1) or die "did not find $id in $args{FASTA}\n";
    read($fasta_fh,my $sequence,$curr_size);
    chomp $sequence;
    $self->sequence($sequence);
    last;
  }

  close($fasta_fh) or die "Couldn't close IN_FILE";
  close($index_fh) or die "Couldn't close INDEX_FILE";
  return 1;
}


=head2 _get_default_index

 Title   : _get_default_index
 Usage   : $index_name=$self->_get_default_index($args{FASTA})
                      unless $index_name;
 Function: Given a filename such as rice_clones.fa, this method removes
           the extension of the file name (the final period and what
           follows it) and returns rice_clones.index.  A period in a
           directory name is left alone, so /data/v1.2/rice gives
           /data/v1.2/rice.index.
 Returns : index_name
 Args    : fasta_file_name


=cut

sub _get_default_index{
  my $self=shift;
  my $index_name=shift;
  # The extension may not contain a path separator; otherwise a dotted
  # directory name would be cut off when the file itself has no extension.
  $index_name=~s{\.[^./\\]*$}{};
  $index_name.='.index';
  return $index_name;
}

=head2 header

 Title   : header
 Usage   : $self->header($header_string);
           my $header_string=$self->header;
 Function: Accessor for the header stored in $self->{HEADER}.  When an
           argument is passed, it becomes the new value.
 Returns : the stored header
 Args    : header_string (optional)


=cut

sub header{
  my ($obj,@new_value)=@_;
  # Any argument, even undef, counts as a new value.
  $obj->{HEADER}=$new_value[0] if @new_value;
  return $obj->{HEADER};
}


=head2 sequence

 Title   : sequence
 Usage   : $self->sequence($seq_string);
           my $seq_string=$self->sequence;
 Function: Accessor for the sequence stored in $self->{SEQUENCE}.  When
           an argument is passed, its upper-cased form becomes the new
           value.
 Returns : the stored sequence
 Args    : seq_string (optional)


=cut

sub sequence{
  my ($obj,@new_value)=@_;
  # Sequences are always kept in upper case.
  $obj->{SEQUENCE}=uc $new_value[0] if @new_value;
  return $obj->{SEQUENCE};
}


=head2 _extract_accession

 Title   : _extract_accession
 Usage   : _extract_accession($header_string)
 Function: Picks the accession out of a fasta header line.  A field
           enclosed in '|' characters and ending in a digit is tried
           first, then the first word after the '>'.  If neither is
           found, the whole line is used.  A trailing version number
           ('.digits') is removed from the result.
 Returns : accession
 Args    : header_string


=cut

sub  _extract_accession{
  my ($header_line)=@_;
  my $accession;

  # genbank-style header: the accession sits between two '|' characters
  if ($header_line=~m/^>.+\|(.+\.?\d+?)\|.+$/){
    $accession=$1;
  }
  # otherwise the accession is the first word on the line
  if (!$accession && $header_line=~m/^>(.+?)\s/){
    $accession=$1;
  }
  # nothing usable found: fall back to the complete line
  $accession=$header_line unless $accession;

  # drop the version number
  $accession=~s/\.\d+$//;
  return $accession;
}


=head2 build_index_file

 Title   : build_index_file
 Usage   : $if->build_index_file(FASTA=>"input.fa",INDEX=>"index_name");
 Function: Creates an index file for a given fasta file.  For every
           record of the fasta file one tab-separated line is written:
           accession, byte offset of the header line, size of the whole
           record in bytes (header line included) and size of the header
           line in bytes.
 Returns : 1 (dies if a file cannot be opened or closed)
 Args    : FASTA,INDEX (optional)
 Note    : The index file is opened for appending, so the new lines are
           added behind whatever the file already contains.


=cut

sub build_index_file{
  my $self=shift;
  my %args=(
            FASTA=>'',
            INDEX=>'',
            @_,
           );

  open(my $fasta_fh,'<',$args{FASTA}) or die "Can't open the file: $args{FASTA}. $!\n";
  my $index_name=$args{INDEX} || $self->_get_default_index($args{FASTA});
  open(my $index_fh,'>>',$index_name) or die "Can't write to index file $index_name: $!\n";

  my ($accession,$offset,$header_size);
  my $record_size=0;    # bytes seen so far in the current record

  # Writes the index line of the record that has just been read completely.
  # Nothing is written before the first header or for an empty accession.
  my $write_entry=sub {
    return unless defined $accession && length $accession;
    print {$index_fh} "$accession\t$offset\t$record_size\t$header_size\n";
  };

  while (my $line=<$fasta_fh>) {
    if ($line=~m/^>/){
      $write_entry->();
      $header_size=length $line;
      #get the pure Accession, without the version number.
      $accession=_extract_accession($line);
      # tell() is already behind the header line, so step back over it.
      $offset=tell($fasta_fh)-$header_size;
      $record_size=0;
    }
    # Only the length is needed, so the record text itself is not kept.
    $record_size+=length $line;
  }
  $write_entry->();

  close($fasta_fh) or die "Couldn't close FASTA FILE $args{FASTA}:$!";
  close($index_fh) or die "Couldn't close INDEX FILE $index_name:$!";
  return 1;
}

=head2 warehouse_path

 Usage:    $path=$if->warehouse_path($file);
           $path=warehouse_path($file);
 Argument: Fasta or index filename
 Value:    "<warehouse>/fasta/<subdir>/<file>", where <warehouse> is
           $ENV{PlantDataWarehouse} or, if that is not set,
           /usr/local/data/warehouse/data, and <subdir> is the
           next-to-last period-separated part of the filename (empty if
           there is none).  The filename is returned unchanged when it
           is empty or the warehouse directory does not exist.
           Whether the file is present in the warehouse is not checked.

=cut

sub warehouse_path {
    my @args=@_;
    # Works both as a method and as a plain function: drop a leading
    # invocant (object or class name) if there is one.
    shift @args
      if @args>1 || (defined $args[0] && UNIVERSAL::isa($args[0],__PACKAGE__));
    my ($file)=@args;

    my $warehouse=$ENV{PlantDataWarehouse} || '/usr/local/data/warehouse/data';

    return $file unless $file && -d $warehouse;

    # rice.cds.fa is filed under the subdirectory "cds"
    my $subdir='';
    $subdir=$1 if $file=~ /\.([^.]+)\.[^.]+$/;
    return "$warehouse/fasta/$subdir/$file";

}

1;

__END__
