#!/usr/local/bin/perl

=head1 NAME

dump_pipeline_fasta_sequences.pl - Create fasta files for use with pipeline

=head1 SYNOPSIS

dump_pipeline_fasta_sequences.pl [options]

Options:

 -h|--help      Show brief help and exit.
 -m|--man       Show the man page.
 -c|--grm_conf  Path to DB config file, def $ENV{GrameneConfPath}.
 -d|--dump_dir  FASTA directory, def /usr/local/data/fasta/build_databases
 -t|--type      Marker type of sequences to dump
 -s|--species   Species to which the sequences will be mapped
 --only_species Only dump markers from --species

=head1 DESCRIPTION 

This program creates FASTA files for mapping to genomes using the
Gramene pipeline.

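The files that are dumped are named according to the following convention
(species and sequence type are lower-cased, fields are joined with
underscores, and the source is only appended when one is supplied);

  <date>_<species>_<seq_type>[_<source>].fa e.g.
  20060329_oryza_est.fa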

Sequences are dumped into /usr/local/data/fasta/build_databases by
default, but the --dump_dir flag can be used to specify an alternative.

The fasta file headers are formatted as follows;

  ><marker_id> <display_synonym> <description>

E.g.

 >5152655 AM071409 Avena sativa partial mRNA for CBF-like...

The marker_id is included to make it unambiguous which marker in
the DB the sequence belongs to, and against which marker any mappings
should be loaded.

=head2 --grm_conf

The config_file configures the read-only connection to the Gramene
MarkersDB

If unspecified, the GrameneConfPath environment variable will be used,
followed by /usr/local/gramene/conf/gramene.conf.

An example config_file;

  <markers>
    db_dsn   dbi:mysql:markersXX:myhost:3307
    db_user  marker_ro_user                                                   
    db_pass  secret
  </markers>

=head2 --species

The aim of the gramene pipeline is to align marker sequences to a
reference genome (or genome components). The --species param specifies
the species of the genome to which the alignments will be
performed. Two files will be dumped, one containing sequences from the
same species as the reference species, and the other containing
sequences from all other species. If you specify --only_species then
only the sequences corresponding to --species will be dumped.

The scope of the species is wildcarded, i.e. if the species is 'Oryza'
then all sequences for the Oryza genus (i.e. O.sativa, O.brachyantha,
O.alta etc) will be selected. Similarly, 'Oryza sativa' will select
O.sativa, O.sativa ssp indica and O.sativa ssp japonica.

=head2 --type

The marker type specifies the type of sequences to dump; EST, EST
Cluster, GSS, Clone etc.

=cut

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;

# Ensure that code is using perl lib in same project
use FindBin qw($Bin);
use File::Basename qw( dirname );
BEGIN {
  # The project root is taken to be two directory levels above the
  # directory holding this script ($Bin). Its lib/perl directory is put
  # at the front of @INC so the project's own modules are found first.
  my $lib_dir = join( '/', dirname( dirname( $Bin ) ), 'lib/perl' );
  unshift @INC, $lib_dir;
}

use Gramene::Marker::DB;

# Argument processing
#
# Runs inside a BEGIN block, so option parsing, the DB connection and the
# validation of --species/--type all happen at compile time. On success
# the following package globals are populated for the rest of the script:
#   $MDB              - Gramene::Marker::DB handle
#   $DUMPDIR          - directory the fasta files are written to
#   @CDBI_SPECIES     - species objects matching "<species>%"
#   $CDBI_MARKER_TYPE - marker type object matching --type
#   $SPECIES, $TYPE   - lower-cased, underscore-separated names used in
#                       the output file names
#   $ONLY_SPECIES     - true if --only_species was given
our( $MDB, $DUMPDIR, @CDBI_SPECIES, $CDBI_MARKER_TYPE, 
     $SPECIES, $TYPE, $ONLY_SPECIES );
BEGIN{
  my( $help, $man, $conffile, $species, $type );

  # The explicit 'c' alias provides the -c short form shown in the
  # SYNOPSIS; it is not an abbreviation of 'grm_conf', so it would
  # otherwise be rejected. The other short forms work as abbreviations.
  # Unknown or malformed options now stop the script with the usage text.
  GetOptions(
             'help'         => \$help,
             'man'          => \$man,
             'grm_conf|c:s' => \$conffile,
             'dump_dir:s'   => \$DUMPDIR,
             'species:s'    => \$species,
             'only_species' => \$ONLY_SPECIES,
             'type:s'       => \$type
             ) || pod2usage(2);
  pod2usage(-verbose => 2) if $man;
  pod2usage(-verbose => 1) if $help;

  # Options take optional values (':s'), so a bare --grm_conf yields an
  # empty string; do not clobber the environment setting with that.
  if( defined $conffile and length $conffile ){
    $ENV{GrameneConfPath} = $conffile;
  }

  # Report the failure via the class that was actually loaded (the old
  # code called ->error on CSHL::Marker::DB, which this script never
  # loads). NOTE(review): assumes Gramene::Marker::DB provides a class
  # method 'error' - confirm against the module.
  $MDB = Gramene::Marker::DB->new( ) ||
      die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n"; 

  $DUMPDIR ||= '/usr/local/data/fasta/build_databases';
  unless( $species ){
    print "[*DIE] Need a --species param\n";
    pod2usage();
  }
  unless( $type ){
    print "[*DIE] Need a --type param\n";
    pod2usage();
  }
  
  # Validate species
  $species =~ s/_/ /g; # Substitute underscore with space
  # Trailing '%' makes this a prefix search on the species name
  @CDBI_SPECIES = $MDB->search_like_Species( species => "$species%" );
  unless( scalar @CDBI_SPECIES ){ # Give up!
    print( "[*DIE] Species $species is not found\n" );
    exit 1;
  }
  $species =~ s/ /_/g; # Revert spaces to underscores for filenames
  $SPECIES = lc($species);

  # Validate marker type
  $type =~ s/_/ /g; # Substitute underscore with space
  ( $CDBI_MARKER_TYPE ) = $MDB->search_MarkerType( marker_type => $type );
  unless( $CDBI_MARKER_TYPE ){
    print( "[*DIE] MarkerType $type is not found\n" );
    exit 1;
  }
  $type =~ s/ /_/g; # Revert spaces to underscores for filenames
  $TYPE = lc($type);
}



MAIN:{
  # Each pass names the species label used in the file name, the species
  # lists handed to the SQL builder, and the progress notice to print.
  # Pass one dumps the sequences of the species the pipeline maps against.
  my @passes = (
    { label   => $SPECIES,
      include => [@CDBI_SPECIES],
      exclude => [],
      notice  => "[INFO] Created 'include' SQL for $TYPE $SPECIES. Running...\n",
    },
  );

  # Pass two dumps every other species, unless --only_species was given.
  if( ! $ONLY_SPECIES ){
    push @passes, {
      label   => "exclude_${SPECIES}",
      include => [],
      exclude => [@CDBI_SPECIES],
      notice  => "[INFO] Created 'exclude' SQL for $TYPE $SPECIES. Running...\n",
    };
  }

  foreach my $pass ( @passes ){
    my $fasta_name = generate_filename( species => $pass->{label},
                                        type    => $TYPE );
    my $query = query_by_type_and_species_list( $CDBI_MARKER_TYPE,
                                                $pass->{include},
                                                $pass->{exclude} );
    print( $pass->{notice} );

    my $fasta_path = join( '/', $DUMPDIR, $fasta_name );
    my $dumped     = dump_file_from_sql( $fasta_path, $query );
    print( "[INFO] Dumped $dumped sequences to $fasta_path\n" );
  }
}


#======================================================================
# Returns SQL to generate fasta file from;
# ARG1 - a CDBI::MarkerType object
# ARG2 - arrayref of CDBI::Species objects to include (may be empty)
# ARG3 - arrayref of CDBI::Species objects to exclude (may be empty)
# At least one of ARG2/ARG3 must hold a species. When both do, both
# restrictions are applied. Dies if no marker type is given, if both
# lists are empty, or if no details table is known for the marker type.
sub query_by_type_and_species_list{
  my $type  = shift || die "Need a marker type";
  # A missing list is treated the same as an empty one
  my $include_species_ref = shift || [];
  my $exclude_species_ref = shift || [];
  my @include_species = @$include_species_ref;
  my @exclude_species = @$exclude_species_ref;

  unless( scalar(@include_species) or scalar(@exclude_species) ){
    die "Need one or more species to include/exclude";
  }

  my $details_table = $MDB->marker_type_to_table_name( $type->marker_type )
      || die( "Cannot find details table for ".$type->marker_type );

  # sprintf template: first %s is the details table, second %s the
  # species restriction. Each row is one complete fasta record; the
  # '\n' is left unexpanded by q[] and sent to the server as written
  # (presumably the server expands it to the header/sequence newline).
  my $sqlt = q[
SELECT CONCAT( '>', m.marker_id, " ", ms.marker_name, ' ',
               if( m.description IS NULL, '', m.description), ' ',
               '\n', md.seq)
FROM   marker m,
       %s md,
       marker_synonym ms
WHERE  m.marker_id=md.marker_id
AND    m.display_synonym_id=ms.marker_synonym_id %s
AND    md.seq IS NOT NULL ];

  my $species_sql = '';
  if( @include_species ){
    $species_sql .= sprintf( "\nAND    m.source_species_id IN ( %s )",
                             join(', ', map{ $_->id } @include_species ) );
  }
  if( @exclude_species ){
    # Append (not assign), so an include restriction is kept as well
    $species_sql .= sprintf( "\nAND    m.source_species_id NOT IN ( %s )",
                             join(', ', map{ $_->id } @exclude_species ) );
  }
  return sprintf( $sqlt, $details_table, $species_sql );
}

#----------------------------------------------------------------------
# Runs a query and writes the first column of every row to a file,
# each followed by a newline.
# ARG1 - path of the file to write (created, or truncated if it exists)
# ARG2 - SQL to run; column one of each row is written verbatim
# Returns the number of rows written, or the string '0E0' (numerically
# zero, but true) when the query produced no rows.
# Dies if an argument is missing, if the statement cannot be prepared,
# executed or fully fetched, or if the file cannot be opened, written
# or closed.
sub dump_file_from_sql{
  my $file = shift || die( "Need a file to dump to" );
  my $sql  = shift || die( "Need some sql to run" );

  my $dbh = $MDB->db;
  # NOTE(review): mysql_use_result presumably streams rows from the
  # server instead of buffering the whole result set - confirm.
  my $sth = $dbh->prepare( $sql, {mysql_use_result=>1} )
      || die( "Cannot prepare SQL: " . ( $dbh->errstr || 'unknown error' ) );
  $sth->execute || die( $sth->errstr );
  
  # Three-arg open with a lexical handle: the file name can never be
  # mistaken for an open mode, and the handle is private to this sub.
  open( my $fasta, '>', $file ) || die "Cannot open $file: $!";
  my $rv = '0E0';
  while( my $row = $sth->fetchrow_arrayref ){
    print {$fasta} $row->[0] . "\n"
        or die "Cannot write to $file: $!";
    $rv++;
  }
  # fetchrow_arrayref returns false both at end of data and on error;
  # check the handle so a failed fetch is not mistaken for completion.
  die( "Fetch failed: " . $sth->errstr ) if $sth->err;

  # Buffered write errors only surface when the handle is closed
  close( $fasta ) || die "Cannot close $file: $!";

  return $rv;
}
#----------------------------------------------------------------------
our $date;
use Date::Calc;
# Builds the name of a fasta dump file: the fields are joined with
# underscores and '.fa' is appended, i.e.
#   <date>_<species>_<type>[_<source>].fa
# Named params;
#   species - species label, 'NO_SPECIES' if missing/false
#   type    - marker type label, 'NO_TYPE' if missing/false
#   source  - optional; left out of the name when missing/false
# The date is today's date as YYYYMMDD. It is worked out on the first
# call and cached in $date, so every file from one run carries the same
# date stamp.
sub generate_filename{
  my %params = @_;
  
  # Today() returns plain integers (year, month, day); pad month and day
  # to two digits so that e.g. 29 March 2006 gives 20060329, not 2006329.
  $date ||= sprintf( '%04d%02d%02d', Date::Calc::Today() );

  return join( '_',
               $date,
               $params{species} || 'NO_SPECIES',
               $params{type}    || 'NO_TYPE',
               $params{source}  || () ) . '.fa';
}
