#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: load-genemodels-from-ensembl.pl,v 1.7 2007/06/05 20:20:42 kclark Exp $


# testing example:
# no changes: LOC_Os01g01010
# new gene:  LOC_Osm1g00260
# obsolete: LOC_Os01g01020 a complete list can be found at ftp://ftp.tigr.org/pub/data/Eukaryotic_Projects/o_sativa/annotation_dbs/pseudomolecules/version_5.0/all.chrs/v4.obsolete.loci.with.v5.coords

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use Date::Calc;
use Data::Dumper;

# Default install locations, applied only when the environment does not
# already provide them.  This has to live in a BEGIN block because the
# "use lib" statements below run at compile time and read these variables.
BEGIN {
    $ENV{'GrameneDir'} ||= '/usr/local/gramene/'; 
    $ENV{'GrameneEnsemblDir'} ||= '/usr/local/gramene_ensembl/'; 
}

#use lib "/home/weix/gramene/lib/perl";

# The first shall be last...
# ("use lib" prepends to @INC, so the directories added by the last
# statement below are the first ones searched.)
use lib map { $ENV{'GrameneDir'}."/$_" } qw ( lib/perl );

use lib map { $ENV{'GrameneEnsemblDir'}."/$_" } 
        qw ( bioperl-live modules ensembl/modules conf
             ensembl-external/modules ensembl-draw/modules
             ensembl-compara/modules );

use lib map { $ENV{'GrameneEnsemblDir'}."/ensembl-live/$_" } 
        qw ( bioperl-live modules ensembl/modules ensembl-external/modules
             ensembl-draw/modules ensembl-compara/modules );

# Project modules are loaded only after the search path has been extended:
# "use" statements execute in source order at compile time, so loading the
# Gramene modules above the "use lib" lines would ignore those directories.
use Gramene::CDBI::Markers;
use Gramene::Marker::DB;
use Bio::EnsEMBL::Registry;

# Stable ids exercised by test mode (-t).  Per the notes at the top of this
# file: LOC_Os01g01010 is unchanged, LOC_Osm1g00260 is a new gene and
# LOC_Os01g01020 is obsolete.
my @test_genes = qw(
                   LOC_Os01g01010
                   LOC_Osm1g00260
                   LOC_Os01g01020
);

# Command-line settings; they are filled in by GetOptions in the
# argument-processing block that follows.
# NOTE(review): $verbose is set by -v but does not appear to be read
# anywhere else in this file.
my($ensembl_species,$verbose,$markersdb_conf, $ms_acc, $test, $markersdb_anal, $registry_file);
# Ensembl analysis logic_name of the genes to load, set by -a|--analysis
# (for example 'tigr_gene').
my $gene_analysis_logic_name;
# Marker type used when searching for and creating markers.
my $marker_type = 'Gene Prediction';
# Markers db species name; derived from $ensembl_species during argument
# processing by replacing underscores with spaces (e.g. 'Oryza sativa').
my $marker_species;
# Synonym type name; resolved to an id via get_synonym_type_id later on.
my $synonym_type = 'GENE_MODEL';

# Screen for duplicates. E.g. in ITGR3 XML DB
# global across chromosomes
# NOTE(review): this hash does not appear to be referenced anywhere else in
# this file, so no duplicate screening actually happens -- confirm.
my %gene_stable_id_list;

{   # Argument processing
    #
    # Parses the command line into the file-level option variables, prints
    # usage and exits when a required option is missing, derives the markers
    # db species name and exports the markers db conf path.
    my $help = 0;
    my $man  = 0;

    Getopt::Long::Configure("no_ignore_case");
    GetOptions(
        "help|?"            => \$help,
        "man"               => \$man,
        "species=s"         => \$ensembl_species,
        "markersdb_conf=s"  => \$markersdb_conf,
        "v+"                => \$verbose,
        'm|ms-acc=s'        => \$ms_acc,
        'a|analysis=s'      => \$gene_analysis_logic_name,
        'markersdb_anal=s'  => \$markersdb_anal,
        't'                 => \$test,
        'r|registry_file=s' => \$registry_file,
    ) or pod2usage(2);

    pod2usage( -verbose => 2 ) if $man;
    pod2usage(1) if $help;

    # Required options, checked in this order; the first missing one is
    # reported and the script exits through pod2usage.
    my @required = (
        [ $ensembl_species, "Need a --species\n" ],
        # The option is -m|--ms-acc; the message used to name a
        # non-existent "--cmap" option.
        [   $ms_acc,
            "Need a map set accession, -m|--ms-acc (for example gt0506)\n"
        ],
        [   $gene_analysis_logic_name,
            "Need the analysis logic name of the genes to be loaded\n"
        ],
        [   $markersdb_anal,
            "Need markers db analysis name for this load option -markersdb_anal\n"
        ],
    );
    for my $requirement (@required) {
        my ( $value, $complaint ) = @{$requirement};
        next if $value;
        warn $complaint;
        pod2usage(1);
    }

    # Markers db species is the Ensembl species with spaces for underscores,
    # e.g. Oryza_sativa => "Oryza sativa".
    $marker_species = $ensembl_species;
    $marker_species =~ s/_/ /g;

    $ENV{GrameneConfPath} = $markersdb_conf if defined $markersdb_conf;

    # GrameneConfPath may legitimately be unset here; avoid interpolating
    # undef (which triggers an "uninitialized value" warning).
    my $conf_path_shown
        = defined $ENV{GrameneConfPath} ? $ENV{GrameneConfPath} : '(not set)';
    print "markersdb conf path = $conf_path_shown\n";
}


# Date stamp for this run (year, month, day as zero-padded digits); stored
# as date_created / date_updated on the records written below.
my $rundate = sprintf '%04d%02d%02d', Date::Calc::Today();

# Markers db handle and the id of the analysis this load is filed under.
my $mdb               = Gramene::Marker::DB->new;
my $markersdb_anal_id = $mdb->find_or_create_analysis($markersdb_anal);

# Fall back to the registry file under GrameneEnsemblDir when none was given.
unless ($registry_file) {
    $registry_file = "$ENV{GrameneEnsemblDir}/conf/ensembl.registry";
}
print "Ensembl registry file is $registry_file\n";

# The registry class itself is used to obtain adaptors.
my $reg = 'Bio::EnsEMBL::Registry';
$reg->load_all($registry_file)
    or die "load_all failed";

my $slice_adaptor = $reg->get_adaptor( $ensembl_species, 'core', 'Slice' )
    or die "can't get Slice adaptor for $ensembl_species";

# The accession must identify exactly one map set.
my @map_sets = Gramene::CDBI::Markers::MapSet->search(
    cmap_map_set_accession => $ms_acc,
);
if ( @map_sets != 1 ) {
    die "More than one or no map set with that accession $ms_acc?!\n";
}

my $map_set_id = $map_sets[0]->map_set_id;
print "Map set id is $map_set_id\n";

# Id of the synonym type attached to gene and transcript names.
my $synonym_type_id = $mdb->get_synonym_type_id($synonym_type);

# Test mode (-t): load only the genes listed in @test_genes, then exit
# without walking the chromosomes.
my $gene_adaptor = $reg->get_adaptor($ensembl_species,'core','Gene');
if ($test) {

    # Fail with a clear message, as is already done for the Slice adaptor,
    # instead of dying later on a method call on an undefined value.
    $gene_adaptor
        or die "can't get Gene adaptor for $ensembl_species";

    for my $g (@test_genes) {
        print "testing $g\n";

        my $gene_obj = $gene_adaptor->fetch_by_stable_id($g);
        if ( !$gene_obj ) {
            print "Not Found: $g\n";
            next;
        }

        my $chr_slice = $gene_obj->slice;
        my $chr_name  = $chr_slice->seq_region_name();

        print "\nChr : $chr_name, ";

        # Map names carry the chromosome name without leading zeros.
        $chr_name =~ s/^0+//;
        my $chr_start = $chr_slice->start();
        my $chr_end   = $chr_slice->end();

        print "$chr_name, $chr_start, $chr_end\n";

        my $map = Gramene::CDBI::Markers::Map->find_or_create({
            map_set_id => $map_set_id,
            map_name   => "Chr. $chr_name",
            start      => $chr_start,
            end        => $chr_end,
        });
        process_ensembl_gene( $gene_obj, $map, $chr_name ) if $map;
    }

    exit;
}
# Production run: visit every slice of the 'chromosome' coordinate system,
# make sure a matching map exists in the map set, and load each gene of the
# requested analysis found on it.
my $chr_slices = $slice_adaptor->fetch_all('chromosome');
#my $chr_slices = $slice_adaptor->fetch_all('toplevel');#not working

CHROMOSOME:
for my $chr_slice ( @{$chr_slices} ) {

    my $chr_name = $chr_slice->seq_region_name();

    # we don't want unassembled clones
    if ( $chr_name =~ /R[^_]*_/ ) {
        next CHROMOSOME;
    }

    # Map names carry the chromosome name without leading zeros.
    $chr_name =~ s/^0+//;
    my $chr_start = $chr_slice->start();
    my $chr_end   = $chr_slice->end();

    print "\n\n$chr_name: $chr_start - $chr_end\n";

    my %map_spec = (
        map_set_id => $map_set_id,
        map_name   => "Chr. $chr_name",
        start      => $chr_start,
        end        => $chr_end,
    );
    my $map    = Gramene::CDBI::Markers::Map->find_or_create( {%map_spec} );
    my $map_id = $map->map_id;
    print "map_id=$map_id\n\n";

    # Genes of the requested analysis on this chromosome.
    my $genes_on_chr
        = $chr_slice->get_all_Genes( $gene_analysis_logic_name, '', 1 );

    for my $gene ( @{$genes_on_chr} ) {
        process_ensembl_gene( $gene, $map, $chr_name );
    }
}






# process_ensembl_gene( $gene, $map, $chr_name )
#
# Loads one Ensembl gene into the markers db and maps it onto $map.
#
#   $gene      gene object; stable_id, start, end, strand and
#              get_all_Transcripts are called on it
#   $map       markers db map object; only map_id is read
#   $chr_name  chromosome name stored in the marker details
#
# Uses the file-level $mdb, $marker_type, $marker_species, $synonym_type_id,
# $markersdb_anal_id and $rundate.
#
# An existing marker with the same name, type and species is updated;
# otherwise a new marker is created.  Progress is printed to STDOUT.
#
# Returns 1 once the mapping call has been made, or nothing (after printing
# an "[*ERR]" line) when the marker cannot be retrieved afterwards.
sub process_ensembl_gene {
    my ( $gene, $map, $chr_name ) = @_;

    my $stable_id   = $gene->stable_id();
    my $gene_start  = $gene->start();
    my $gene_end    = $gene->end();
    my $gene_strand = $gene->strand();

    # Transcript stable ids become synonyms of the gene marker, except a
    # transcript whose id is the same as the gene's own stable id.
    my @trpt_synonyms =
        map  { +{ marker_name => $_, synonym_type_id => $synonym_type_id } }
        grep { $_ ne $stable_id }
        map  { $_->stable_id() } @{ $gene->get_all_Transcripts() };

    # Check if the same marker exists from a previous build.
    my $markers = $mdb->marker_search(
        marker_name => $stable_id,
        marker_type => $marker_type,
        species     => $marker_species,
    );

    # Prefer a marker already filed under this analysis; failing that, take
    # the candidate with the highest marker_id (the most recent one).
    my $marker_id;
    for my $mk_hash ( @{ $markers || [] } ) {
        print "Dumper: " . Dumper($mk_hash);

        my $mid       = $mk_hash->{marker_id};
        my $candidate = Gramene::CDBI::Markers::Marker->retrieve($mid);

        # A search hit that cannot be retrieved is not a usable candidate;
        # skip it rather than die on a method call on undef.
        next unless $candidate;

        my $candidate_anal_id = $candidate->analysis_id;
        if ( defined $candidate_anal_id
            && $candidate_anal_id == $markersdb_anal_id )
        {
            $marker_id = $mid;
            last;
        }

        $marker_id = $mid if !$marker_id || $marker_id < $mid;
    }

    # If the marker already exists from a previous analysis, update it
    # (including its analysis_id); otherwise create a new one.
    my $action = $marker_id ? 'update' : 'create';
    print "$stable_id needs $action\n";

    if ( $action eq 'update' ) {
        $mdb->update_marker(
            marker_id   => $marker_id,
            marker_name => {
                marker_name     => $stable_id,
                synonym_type_id => $synonym_type_id,
            },
            synonyms     => [@trpt_synonyms],
            analysis_id  => $markersdb_anal_id,
            date_updated => $rundate,
            details      => { chromosome => $chr_name },
        );
    }
    else {
        $marker_id = $mdb->create_marker(
            marker_name  => $stable_id,
            marker_type  => $marker_type,
            species      => $marker_species,
            synonyms     => [@trpt_synonyms],
            analysis_id  => $markersdb_anal_id,
            date_created => $rundate,
            details      => { chromosome => $chr_name },
        );
    }

    # Confirm the marker really is in the db before reporting success; do
    # not attempt a lookup with an undefined id.
    my $marker = $marker_id
        ? Gramene::CDBI::Markers::Marker->retrieve($marker_id)
        : undef;

    unless ($marker) {
        print "[*ERR]: Cannot $action marker for $stable_id, $gene_start, $gene_end, $gene_strand, trpt_synonyms = "
            . join( ' ', map { $_->{marker_name} } @trpt_synonyms ) . "\n";
        return;
    }

    print "$action $stable_id is done\n";

    my $gene_len = $gene_end - $gene_start + 1;
    my $map_id   = $map->map_id;

    # Shown in the trace below for information only; it is not passed to
    # set_marker_mapping.  May be undefined, so make it printable.
    my $marker_synonym_id = $marker->display_synonym_id;
    $marker_synonym_id = '' unless defined $marker_synonym_id;

    print "before set_marker_mapping 
marker_id              => $marker_id, 
map_id                 => $map_id,
analysis_id            => $markersdb_anal_id
marker_synonym_id      => $marker_synonym_id,
start                  => $gene_start,
end                    => $gene_end,
strand                 => 1,
marker_start           => 1,
marker_end             => $gene_len,
marker_strand          => $gene_strand,
date_created           => $rundate, 
";

    # The map-side strand is fixed at 1 and the gene's own strand is passed
    # as marker_strand; the marker-side coordinates run from 1 to the gene
    # length.
    my $mapping_id = $mdb->set_marker_mapping(
        marker_id     => $marker_id,
        map_id        => $map_id,
        analysis_id   => $markersdb_anal_id,
        start         => $gene_start,             # REQUIRED
        end           => $gene_end,
        strand        => 1,
        marker_start  => 1,
        marker_end    => $gene_len,
        marker_strand => $gene_strand,
        date_created  => $rundate,
    );

    print "$stable_id => mappingID $mapping_id\n\n";

    return 1;
}


__END__

    my $stable_id  = $gene->stable_id();
    my $gene_start = $gene->start();
    my $gene_end   = $gene->end();
    my $gene_strand = $gene->strand();

    my @trpt_stable_ids = grep{
      $_ ne $stable_id
    }
      map{
      $_->stable_id()
    }
      @{$gene->get_all_Transcripts()};

#    print "$stable_id, $gene_start, $gene_end, $gene_strand\n\t" . join(' ', @trpt_stable_ids)."\n";


    #create the marker
    my $marker_id      =  $mdb->find_or_create_marker(
                    marker_name => $stable_id,
                    marker_type => $marker_type,
                    species     => $marker_species,
                    synonyms    => [@trpt_stable_ids],
                );
    my $marker = Gramene::CDBI::Markers::Marker->retrieve( $marker_id );

    unless($marker){
      print "[*ERR]: Cannot create marker for $stable_id, $gene_start, $gene_end, $gene_strand, trpt_synonyms = " . join(' ', @trpt_stable_ids)."\n";
      next;
    }

    my $mapping    = Gramene::CDBI::Markers::Mapping->find_or_create({
            marker_id          => $marker->id,
            display_synonym_id => $marker->display_synonym_id,
            map_id             => $map->map_id,
            start              => $gene_start,
            end                => $gene_end,
            strand             => $gene_strand,
        });

    print "$stable_id => mappingID ". $mapping->mapping_id ."\n";


# ----------------------------------------------------

=head1 NAME

load-genemodels-from-ensembl.pl - load specified gene models from ensembl database to markers db

=head1 VERSION

This documentation refers to load-genemodels-from-ensembl.pl version $Revision: 1.7 $

=head1 SYNOPSIS

  load-genemodels-from-ensembl.pl [options] 

  For example:

  testing:
  perl load-genemodels-from-ensembl.pl -t -m gt0506 -s Oryza_sativa -markersdb_conf /usr/local/gramene/conf/gramene.conf -markersdb_anal TIGRv5_gene_model -a tigr_gene

  production:
  perl load-genemodels-from-ensembl.pl -m gt0506 -s Oryza_sativa -markersdb_conf /usr/local/gramene/conf/gramene.conf -markersdb_anal TIGRv5_gene_model -a tigr_gene > /scratch/weix-tmp/ensembl_gene2markersdb/TIGRv5genes2markersdb.out &

Required Arguments:

  --species                 ensembl_species for connect to correct ensembl database 
                            (for example: Oryza_sativa, 
                            the markers db species will derive from it => 
                                          Oryza sativa)

  --m|ms-acc                map set accession in the markers db (for example gt0506)

  --a|analysis              the ensembl database analysis logic_name 
                            corresponding to the gene sets you want to load 
                            into markers db (for example: tigr_gene)

  --markersdb_anal          the markers db analysis_name for this load
	      

Options:

  --markersdb_conf          the conf file for markersdb to be loaded into, default is
                            	      /usr/local/gramene/conf/gramene.conf

  --r|registry_file         the conf file for connecting to ensembl database, default 
                            is /usr/local/gramene_ensembl//conf/ensembl.registry

  --t                       testing

  --help                    Show brief help and exit
  --man                     Show full documentation
  --v                       verbose,

=head1 DESCRIPTION

This script retrieves a set of gene models (distinguished by analysis logic_name) from the specified ensembl database and loads them into the markers db as markers of type 'Gene Prediction' (with synonym type GENE_MODEL) under the specified markers db analysis_name. If the same gene model already exists in the markers db, it is updated, including its mapping; otherwise a new gene model and its mapping are created. Obsolete gene models stay in the markers db with their mappings. The analysis_name should be different from that of earlier loads.


  
=head1 SEE ALSO

Gramene::Marker::DB, Gramene::CDBI::Markers.

=head1 AUTHOR

Sharon Wei E<lt>weix@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
