#!/usr/local/bin/perl

=head1 NAME

retrieve-datasets.pl - create gramene track sequence datasets from markers database for mapping

=head1 SYNOPSIS

  retrieve-datasets.pl [options]

Options:

  -h|--help         Show brief help and exit.
  -v|--verbose      Talk about what's happening.
  -c|--config_file  Path to DB config file, def $ENV{GrameneConfPath}.
  --dir             Directory to write output files into. Def $PWD
  --q               The query SQL.
  --ds_name         Dataset logic name.

=head1 DESCRIPTION

Create fasta sequence files for mapping, each file corresponds to a distinct
track on the gramene genome browser 

The program retrieves the requested sequences using the markers database
query formulated for each gramene track and outputs the sequences into a
fasta file using the track's logic name as the file name. It also produces
a log file reporting how many sequences were retrieved for each dataset.

Format of config_file:

  <markers_admin>
      db_dsn   dbi:mysql:{dbname}:{myhost}:{myport}
      db_user  marker_rw_user
      db_pass  secret
  </markers_admin>


Everything printed to STDERR by the program will be logged to a
file. Lines start with '[INFO]', '[WARN]', '[*ERR]' or '[*DIE]' for
ease of grepping. The output dir can be specified as a program
argument.


=head1 SEE ALSO

Gramene::Marker::DB, Text::RecordParser.

=head1 AUTHOR

Sharon Wei E<lt>weix@cshl.eduE<gt>.

=cut

# ----------------------------------------------------

use strict;
use Getopt::Long;

use Gramene::Ontology::OntologyDB;
use Pod::Usage;
use Text::RecordParser;

use Bio::SeqIO;
use Data::Dumper qw(Dumper);
use Date::Calc;
use Carp;

use lib "/home/weix/gramene/lib/perl";
use Gramene::Marker::DB;
use IO::Prompt;
use List::MoreUtils qw(firstval);
use Readonly;

# Lookup table keyed by track logic name. Each value is a hash ref whose
# 'query' entry is a SQL statement against the markers database that
# selects the marker_id of every marker belonging to that track.
# Lines starting with '#' inside an entry are older, disabled variants
# of the query (mostly library-name based selections).
Readonly my %LOGICNAME_TO_QUERY => (
      # --- Tracks selected by marker type 'EST' and a species name pattern ---
      Rice_est   => {
		     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'EST' and s.species like 'Oryza %' ",
		     #marker_type => 'EST',
		    },

     Barley_est  => {
		     query => "select m.marker_id from marker m, species s, marker_type t  where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id  and t.marker_type = 'EST' and s.species like 'Hordeum %' ",
		    },


     Maize_est  => {
		     query => "select m.marker_id from marker m, species s,  marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'EST' and s.species like 'Zea %' ",
		    },

    Millet_est  => {
		     query => "select m.marker_id from marker m, species s,  marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'EST' and s.species like 'Pennisetum %' ",
		    },				    

    Sorghum_est => {
		    query => "select m.marker_id from marker m, species s,  marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'EST' and s.species like 'Sorghum %' ",
		   },

    Sugarcane_est => {
		    query => "select m.marker_id from marker m, species s,  marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'EST' and s.species like 'Saccharum%' ",
		   },

    Wheat_est => {
		    query => "select m.marker_id from marker m, species s,  marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'EST' and s.species like 'Triticum %' ",
		   },


    # --- Rice markers of type 'mRNA' ---
    Rice_CDS  => {
		  query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'mRNA' and s.species like 'Oryza %' ",
		 },
    
    # --- Maize markers of type 'BAC end sequence' ---
    Maize_BACend  => { 
	             query => "select m.marker_id from marker m, species s , marker_type t where m.source_species_id = s.species_id  and m.marker_type_id = t.marker_type_id and t.marker_type = 'BAC end sequence' and s.species like 'Zea %' ",
		    },


    # --- *_BACend_OMAP tracks: markers of type 'GSS', one Oryza species per track ---
    RiceAlta_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza alta' ",
			     #"select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OA_BBa' ",
			     #marker_type => '',
			    },

      RiceAustraliensis_BACend_OMAP  => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza australiensis' ",
#		      #query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OA_ABa' ",
					},

      RiceBrachyantha_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza brachyantha' ",	
#	      #query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OB__Ba ' ",
				     },

      RiceCoarctata_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza coarctata' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OC__Ba' ",
			      },

      RiceGlaberrima_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza glaberrima' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OG_BBa' ",
			      },
      
      RiceGranulata_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza granulata' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OG_ABa' ",
			      },

      # Unlike its siblings this entry matches by prefix ('oryza sativa%').
      RiceJaponica_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species like 'oryza sativa%' ",	
#	      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OSJNBa' ",
			      },

      RiceMinuta_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza minuta' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OM__Ba' ",
			      },

      RiceNivara_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza nivara' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OR_BBa' ",
			      },

      RicePunctata_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza punctata' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OP__Ba' ",
		      #marker_type => '',
			      },

      RiceRufipogon_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza rufipogon' ",
#		      query => "select m.marker_id from marker m, library l where m.library_id = l.library_id  and l.library_name = 'OR_CBa' ",
		      #marker_type => '',
			      },

     RiceOfficinalis_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza officinalis' ",
				    },

     RiceRidleyi_BACend_OMAP => {
			     query => "select m.marker_id from marker m, species s, marker_type t where m.source_species_id = s.species_id and m.marker_type_id = t.marker_type_id and t.marker_type = 'GSS' and species = 'oryza ridleyi' ",
				},
				    
    # --- Tracks selected by library name, joined to a marker_details_* table ---
    Rice_FstTransposon => { #4183 sequences
                      query => "select m.marker_id
                                from marker m, marker_details_gss md, library l
                                where m.marker_id=md.marker_id
                                and m.library_id=l.library_id
                                and l.library_name IN (
                                'UCD RdSpm Rice Insertions',
                                'UCD RGT Rice Insertions',
                                'UCD RDs Rice Insertions',
                                'UCD RGdSpm Rice Insertions' )",
			  },

    Rice_T_DNA_Insert => { #14533 sequences
		       query => "select m.marker_id
                                 from marker m, marker_details_gss md, library l
                                 where m.marker_id=md.marker_id
                                 and m.library_id=l.library_id
                                 and l.library_name IN (
                                 'AS_TRIM_TDNA_B1',
                                 'Flanking Sequence Tag of Oryza sativa T-DNA insertion lines' )",
			  },


    Rice_jap_cDNA_KOME => { #32127 sequences
		     query => "select m.marker_id
                               from marker m, marker_details_mrna md, library l
                               where m.marker_id=md.marker_id
                               and m.library_id=l.library_id
                               and l.library_name like 'KOME database of full-length cDNA%' ",
			 },
				    
    # NOTE(review): the sequence count below is identical to the one on
    # Rice_jap_cDNA_KOME above; possibly copied along with the entry - confirm.
    Rice_tos17_insert  => { #32127 sequences,
		      query => "select m.marker_id
                                from marker m, marker_details_gss md, library l
                                where m.marker_id=md.marker_id
                                and m.library_id=l.library_id
                                and l.library_name IN(
                                'PCR product directly amplified from rice genomic DNA' )",
			  },



      # --- *_GI tracks: analysis 'tigr_gene_index', one species and one release version each ---
      Barley_GI => {
		      query => " select m.marker_id from marker m, analysis a, species s, marker_details_est_cluster d where m.analysis_id = a.analysis_id  and m.source_species_id = s.species_id and m.marker_id = d.marker_id and a.analysis_name = 'tigr_gene_index' and s.species = 'Hordeum vulgare' and d.version = 'HVGI release 9' ",
		      #marker_type => '',
			      },

      Maize_GI => {
		      query => " select m.marker_id from marker m, analysis a, species s, marker_details_est_cluster d where m.analysis_id = a.analysis_id  and m.source_species_id = s.species_id and m.marker_id = d.marker_id and a.analysis_name = 'tigr_gene_index' and s.species = 'Zea mays' and d.version = 'ZMGI release 15' ",
		      #marker_type => '',
			      },
      Rice_GI => {
		      query => " select m.marker_id from marker m, analysis a, species s, marker_details_est_cluster d where m.analysis_id = a.analysis_id  and m.source_species_id = s.species_id and m.marker_id = d.marker_id and a.analysis_name = 'tigr_gene_index' and s.species = 'Oryza sativa' and d.version = 'OGI release 16' ",
		      #marker_type => '',
			      },

      Sorghum_GI => {
		      query => " select m.marker_id from marker m, analysis a, species s, marker_details_est_cluster d where m.analysis_id = a.analysis_id  and m.source_species_id = s.species_id and m.marker_id = d.marker_id and a.analysis_name = 'tigr_gene_index' and s.species = 'Sorghum bicolor' and d.version = 'SBGI release 1-1' ",
		      #marker_type => '',
			      },
      Wheat_GI => {
		      query => " select m.marker_id from marker m, analysis a, species s, marker_details_est_cluster d where m.analysis_id = a.analysis_id  and m.source_species_id = s.species_id and m.marker_id = d.marker_id and a.analysis_name = 'tigr_gene_index' and s.species = 'Triticum aestivum' and d.version = 'TAGI release 10' ",
		      #marker_type => '',
			      },
				    

      # --- Remaining tracks: selected only by analysis_name, which equals the logic name ---
      Barley_ESTCluster_PlantGDB => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Barley_ESTCluster_PlantGDB' ",
			      },
      Maize_ESTCluster_PlantGDB => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Maize_ESTCluster_PlantGDB' ",
			      },
      Rice_ESTCluster_PlantGDB => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Rice_ESTCluster_PlantGDB' ",
			      },
      Sorghum_ESTCluster_PlantGDB => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Sorghum_ESTCluster_PlantGDB' ",
			      },
      Wheat_ESTCluster_PlantGDB => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Wheat_ESTCluster_PlantGDB' ",
			      },

      Sorghum_ESTCluster3P_LGBPratt => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Sorghum_ESTCluster3P_LGBPratt' "
			      },
      
      Rice_ind_cluster => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Rice_ind_cluster' ",
			      },
      Rice_ind_est => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Rice_ind_est' ",
			      },


      Maize_hi_cot_TIGR => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Maize_hi_cot_TIGR' ",
			      },

     Maize_meth_filt_hi_cot_cluster => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Maize_meth_filt_hi_cot_cluster' ",
			      },

      Maize_MAGI_ISU => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Maize_MAGI_ISU' ",
			      },


      Ryegrass_Sequence => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Ryegrass_Sequence' ",
			      },
      Ryegrass_Assembly => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Ryegrass_Assembly' ",
			      },

      # Key is quoted because it contains a '-'.
      'Sorghum_gss-read_Klein' => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Sorghum_gss-read_Klein' ",
			      },


      Sorghum_orion => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Sorghum_orion' ",
			      },

     Sorghum_CDNA => {
		      query => "select m.marker_id from marker m, analysis a where m.analysis_id = a.analysis_id and a.analysis_name = 'Sorghum_CDNA' ",
			      },
);


=stub

     Maize_hi_cot_Bennetzen => { # 446926 sequences, 186047 in markers21
		     query => "select m.marker_id
                               from marker m, marker_details_gss md, library l
                               where m.marker_id=md.marker_id
                               and m.library_id=l.library_id
                               and l.library_name IN (
                               'ZM_0.6_1.0_KB',
                               'Maize Cot library',
                               'Maize Cot100 library',
                               'Maize Cot200 library',
                               'Maize Cot466 library' )",
			       },

    Maize_meth_filt_CSHL_Mccombie => { #66390 sequences, 0 in markers21
		      query => "select m.marker_id
			       from marker m, marker_details_gss md, library l
                               where m.marker_id=md.marker_id
                               and m.library_id=l.library_id
                               and l.library_name IN (
                               'WGS-ZmaysF (DH5a methyl filtered)',
                               'WGS-ZmaysF (JM107 adapted methyl filtered)',
                               'JM107 adapted methyl filtered library' )",
			       },

    Maize_meth_filt_TIGR => {#450197 sequences, 199096 in markers21
		      query => "select m.marker_id
                              from marker m, marker_details_gss md, library l
                              where m.marker_id=md.marker_id
                              and m.library_id=l.library_id
                              and l.library_name IN (
                              'ZM_0.7_1.5_KB',
                              'ZM2_0.7_1.5_KB' )",
			    },

    Maize_Mu_Insert  => { #191715 sequences, 60851 in markers21
                      query => "select m.marker_id
                               from marker m, marker_details_gss md, library l
                               where m.marker_id=md.marker_id
                               and m.library_id=l.library_id
                               and l.library_name LIKE '% - RescueMu Grid %' ",
			},

=cut



# Leave quietly (exit status 0) when interrupted with Ctrl-C.
$SIG{'INT'} = sub { exit(0) };

# Command line options, see the POD at the top of the file.
# $v (verbose) is also read by create_fasta().
my ( $help, $v, $conffile, $dir, $query, $ds_name);

# GetOptions returns false on unknown or malformed options; stop with the
# usage message instead of running with a half-parsed command line.
GetOptions(
           'help'           => \$help,
           'config_file:s'  => \$conffile,
           'verbose'        => \$v,
           'dir:s'          => \$dir,
	   'q:s'            => \$query,
	   'ds_name:s'      => \$ds_name,
#	   'analysis:s'     => \@analyses,
) or pod2usage(2);
pod2usage(-verbose => 2) if $help;

#----
# Validate params
# Each failed check prints a '[*DIE]' line and pod2usage() then exits.
if( defined $conffile ){

  unless( -e $conffile ){
    warn( "\n[*DIE] File $conffile does not exist\n\n" );
    pod2usage;
  }
  unless( -r $conffile ){
    warn( "\n[*DIE] Cannot read $conffile\n\n" );
    pod2usage;
  }
  unless( -f $conffile ){
    warn( "\n[*DIE] File $conffile is not plain-text\n\n" );
    pod2usage;
  }
  unless( -s $conffile ){
    warn( "\n[*DIE] File $conffile is empty\n\n" );
    pod2usage;
  }

  $ENV{GrameneConfPath} = $conffile ;
}

#----
# dataset name and query
# The dataset name becomes part of the output file name, so it is required.
unless( defined $ds_name && length $ds_name ){
  warn( "\n[*DIE] No dataset logic name given (--ds_name)\n\n" );
  pod2usage;
}

# Without an explicit --q fall back to the query registered for this
# logic name in %LOGICNAME_TO_QUERY. The exists() test comes first so the
# Readonly hash is never asked to autovivify an unknown key.
unless( defined $query && $query =~ m/\S/ ){
  if( exists $LOGICNAME_TO_QUERY{$ds_name} ){
    $query = $LOGICNAME_TO_QUERY{$ds_name}{query};
  }
  else{
    warn( "\n[*DIE] No query given (--q) and no query is known for"
          . " dataset '$ds_name'\n\n" );
    pod2usage;
  }
}

#----
# output dir
# $ENV{PWD} is not set in every environment; use '.' as the last resort.
$dir ||= $ENV{PWD} || '.';
unless( -d $dir && -w $dir ){
  warn( "\n[*DIE] Output dir $dir is not a writable directory\n\n" );
  pod2usage;
}

#---
# logfile
# Name: <dir>/<yyyymmdd>.<pid>.<script name without extension>.log
my( $file ) = ( $0 =~ m/([^\/]+)$/ );
$file =~ s/\.\w+$//;
my $date = sprintf('%4.4i%2.2i%2.2i', Date::Calc::Today());
my $logfile = join( ".", $date, $$, $file, 'log' );
$logfile    = "$dir/$logfile";

=stub

my $run = prompt -yn, "Creating gramene track seq datasets using the following parameters? 
config file = $conffile (default /usr/local/gramene/conf/gramene.conf)
output dir  = $dir
log file    = $logfile
[y/n]
";

exit unless ( $run );

=cut

# Log to file
# The log handle is lexical and opened with three-arg open so that the
# file name can never be interpreted as an open mode. STDERR is then
# pointed at the same file, so every '[*ERR]' line ends up in the log.
open( my $log_fh, '>', $logfile )
    or die( "[*DIE] Cannot write log file $logfile: $!\n" );
open( STDERR, '>&', $log_fh )
    or die( "[*DIE] Cannot redirect STDERR to $logfile: $!\n" );


# Marker database handle; also used by create_fasta().
my $MDB = Gramene::Marker::DB->new
    or die "\n[*DIE] " . Gramene::Marker::DB->error . "\n\n";


my ( $num_of_created, $num_of_errors ) = ( 0, 0 );


print "Logic_name = $ds_name\n" if $v;

# Run the dataset query to get the ids of all markers to export.
my @marker_ids = $MDB->search_marker_ids_by_query( $query );

my $num_of_markers = scalar @marker_ids;
print "found $num_of_markers markers for $ds_name\n" if $v;

my $outfile = "$dir/$ds_name-$date.fa";

( $num_of_created, $num_of_errors ) = create_fasta( $outfile, \@marker_ids);

# Summary line: dataset, markers found, sequences written, errors (tab separated).
print {$log_fh} "\n", join( "\t", $ds_name, $num_of_markers, $num_of_created, $num_of_errors );


# Buffered write errors only show up at close time, so check it.
close( $log_fh ) or warn( "[WARN] Problem closing log file $logfile: $!\n" );

#======================================================================
#
# create_fasta( $outfile_name, $marker_ids_ref )
#
# Writes one fasta record per marker id to $outfile_name.
#
#   $outfile_name   - path of the fasta file to create (overwritten)
#   $marker_ids_ref - array ref of marker ids
#
# Returns the list ( $num_of_created, $num_of_errors ): the number of
# sequences written and the number of markers skipped because the id was
# false, no display name was found, or no sequence was found. Skipped
# markers are reported on STDERR as '[*ERR]' lines.
#
# Uses the file-level $MDB (marker database handle) and $v (verbose flag).
sub create_fasta{

  my $outfile_name    = shift;
  my $marker_ids_ref  = shift;

  # Name used in log messages: the file's base name without its fasta
  # extension. The match is anchored at the end of the path so that a
  # '.f...' inside a directory name cannot be picked up instead.
  my ($track_logic_name) = ($outfile_name =~ m{ ([^/]+) \. fa (?:sta)? \z }xms);
  $track_logic_name = $outfile_name unless defined $track_logic_name;

  my ( $num_of_created, $num_of_errors ) = (0, 0);
  my $seqio = Bio::SeqIO->new(-file=>">$outfile_name", -format=>'fasta');

  # Synonym types to try for the display name, most preferred first.
  my @preferred_synonym_types
    = qw( GENBANK_VERSION GENBANK_ACCESSION GENBANK_GI );

  for my $marker_id ( @{ $marker_ids_ref || [] } ){

    unless( $marker_id ){
      print STDERR "[*ERR] $track_logic_name: No marker_id $marker_id\n";
      $num_of_errors++;
      next;
    }

    print "$marker_id\n" if $v;

    my @marker_synonyms = $MDB->get_marker_synonyms( marker_id => $marker_id );

    # Take the first synonym of the most preferred type that is present.
    my $marker_display_syn;
    for my $synonym_type ( @preferred_synonym_types ){
      $marker_display_syn
        = firstval { $_->{synonym_type} eq $synonym_type } @marker_synonyms;
      last if $marker_display_syn;
    }

    # No GenBank synonym: fall back to the display name from the database.
    my $marker_display_name = $marker_display_syn
      ? $marker_display_syn->{marker_name}
      : $MDB->get_marker_display_name( marker_id => $marker_id );

    unless( $marker_display_name ){
      print STDERR "[*ERR] $track_logic_name: No marker_display_name found for marker_id $marker_id\n";
      $num_of_errors++;
      next;
    }

    print "$marker_id => $marker_display_name\n" if $v;

    my $marker_seq = Gramene::CDBI::Markers::MarkerSequence->retrieve($marker_id);

    # seq() dies when retrieve() found no row; the eval turns that into an
    # undefined $seq, so one check reports both "no row" and "empty
    # sequence" exactly once.
    my $seq = eval{ $marker_seq->seq() };

    unless( $seq ){
      print STDERR "[*ERR] $track_logic_name: No sequence found for marker_id $marker_id|$marker_display_name\n";
      $num_of_errors++;
      next;
    }

    my $fasta_record = Bio::Seq->new(
			   -display_id => "mi|${marker_id}|mn|$marker_display_name|",
			   -seq       => $seq,
			  );

    # Count only records that were actually written.
    if( $seqio->write_seq($fasta_record) ){
      $num_of_created++;
    }

  }

  return( $num_of_created, $num_of_errors );
}

__END__

#======================================================================


