#!/usr/bin/perl -w

#transfer/dump genes. The source genes/transcripts can be all 
#transcripts in query db or use IDs from an input file

use strict;
use Getopt::Long;
use Bio::SeqIO;
use Data::Dumper;
use Pod::Usage;
use File::Basename;
use Cwd 'abs_path';
use vars qw[ $VERSION ];
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::DnaDnaAlignFeature;
use Bio::EnsEMBL::Pipeline::Tools::TranscriptUtils;
use Bio::EnsEMBL::Pipeline::Tools::TranslationUtils;


$VERSION = sprintf "%d.%02d", q$Revision: 1.16 $ =~ /(\d+)\.(\d+)/;
pod2usage(0) unless $ARGV[0];

my ($_help, $_version, $sdbhost, $sdbport, $sdbuser, $sdbpass, $sdbname, 
    $sdnadb, $tdbhost, $tdbport, $tdbuser, $tdbpass, $tdbname, $idfile, $type, 
    $istrans, $coord_system, $addid, $idbase, $keepid, $addscore, $newtype,
    $glen, $plen, $write, $usechr, $baselen);
GetOptions(
	   'h|help'        => \$_help,
	   'v|version'     => \$_version,
	   'dbhost=s'      => \$sdbhost,
	   'dbport=s'      => \$sdbport,
	   'dbuser=s'      => \$sdbuser,
	   'dbpass=s'      => \$sdbpass,
	   'dbname=s'     => \$sdbname,
	   'dnadb=s'       => \$sdnadb,
	   'tdbhost=s'      => \$tdbhost,
	   'tdbport=s'      => \$tdbport,
	   'tdbuser=s'      => \$tdbuser,
	   'tdbpass=s'      => \$tdbpass,
	   'tdbname=s'     => \$tdbname,
	   'idfile=s'      => \$idfile,
	   'type=s'        => \$type,
	   'trans'          => \$istrans,
	   'coord_system=s'      => \$coord_system,
	   'addid=s'         => \$addid,
	   'idbase=s'      => \$idbase,
	   'keepid'    => \$keepid,
	   'score=s'       => \$addscore,
	   'newtype=s'   => \$newtype,
	   'glen=s'        => \$glen,
	   'plen=s'        => \$plen,
	   'write'          => \$write,
	   'usechr'         => \$usechr,
	   'baselen=s'      => \$baselen,
	   ) or die;
pod2usage(2) if $_help;
if ( $_version ) {
    my $prog = basename( $0 );
    print "$prog v$VERSION\n";
    exit(0);
}

$sdbport ||= '3306';
$tdbhost ||= $sdbhost;
$tdbuser ||= $sdbuser;
$tdbport ||= $sdbport;
$tdbpass ||= $sdbpass;

print STDERR "Target: $tdbhost $tdbport $tdbname\n";

my $sdba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
					       -user   => $sdbuser,
					       -pass   => $sdbpass,
					       -dbname => $sdbname,
					       -host   => $sdbhost,
					       -port => $sdbport,
					       -driver => 'mysql',
					       );
my $tdba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
					       -user   => $tdbuser,
					       -pass   => $tdbpass,
					       -dbname => $tdbname,
					       -host   => $tdbhost,
					       -port => $tdbport,
					       -driver => 'mysql',
					       );
if ($sdnadb) {
    my $sdnadba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
						     -user   => $sdbuser,
						     -pass   => $sdbpass,
						     -dbname => $sdnadb,
						     -host   => $sdbhost,
						     -port => $sdbport,
						     -driver => 'mysql',
						     );
    $sdba->dnadb($sdnadba);
#    $tdba->dnadb($sdnadba);
}

$baselen ||= 6;
my $isgene = !$istrans;
my $csa = $sdba->get_CoordSystemAdaptor();
my $cs = $csa->fetch_by_rank(1)||$csa->fetch_all->[0];
$coord_system ||= $cs->name || 'chromosome';
print "$coord_system\n";
$idbase ||= 'TMP';
my $slice_adaptor = $tdba->get_SliceAdaptor;

my $ids = get_ids($idfile) if ($idfile);
my $genes;
my $transcripts;
if ($ids) {
    if ($isgene) {
	$genes = get_by_id($sdba->get_GeneAdaptor, $ids);
	print STDERR "id file -- gene num: ", (scalar @{$genes||[]}), "\n";
#	print_transcripts($genes->[0]->get_all_Transcripts);
    }else {
	if ($type && $type =~ /^predict/) { 
	    $transcripts = get_by_id($sdba->get_PredictionTranscriptAdaptor, $ids);
	}else {
	    $transcripts = get_by_id($sdba->get_TranscriptAdaptor, $ids);
	}
    print STDERR "id file -- trans num: ", (scalar @{$transcripts}), "\n";
    }
}else {
    $genes = get_all_genes($sdba, $type);
    print STDERR "no id file -- gene : ", scalar @$genes, "\n";
    if ($type && $type =~ /^predict/) { 
	$transcripts = $genes; 
	$genes = undef;
    }
}

my $id_start = $addid||1;
my $gene_id = int($id_start/2)*2+1;
my $genedba = $tdba->get_GeneAdaptor();

if ($transcripts) {
    my $num=0;
    foreach my $i (0..@$transcripts-1) {
	my $trans = $transcripts->[$i];
	$trans = $trans->transform($coord_system);
	my $slice = $slice_adaptor->fetch_by_name($trans->slice->name);
	if (!$slice) {print STDERR "no slice: ",$trans->slice->name, "\n";next;}
#	print $trans->seq->seq, "\n";
	my $tl = $trans->translation;
	if (! $tl) {
	    $trans = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		->compute_translation( $trans );
	    if (! $trans ) { next;}
	}elsif ($addscore || $type && $type =~ /^predict/) {
	    print_transcript($trans);
#	    print 'translation: ', $tl->start, ' ', $tl->end, "\n";
	    print $trans->translate->seq, " type 2\n";
	    my $ts_new = Bio::EnsEMBL::Transcript->new
		(
#		 -exons       => $trans->get_all_Exons,
		 -stable_id   => $trans->stable_id,
		 -version     => 1,
 		 -analysis    => $trans->analysis,
		 -slice       => $trans->slice,
		 -start       => $trans->start,
		 -end         => $trans->end,
		 -strand      => $trans->strand,
		 );
	    my @tfs;
	    my $hstart=0;
	    my $hend=0;
	    my $start_exon;
	    my $end_exon;
	    my $in_codon=0;
	    my @exons = @{$trans->get_all_Exons};
	    foreach my $i (0..@exons-1) {
		my $e = $exons[$i];
		if (!$in_codon) { $hstart = $hend+1; }
#		$hend += int($e->length);
		my $start = $e->start;
		my $end = $e->end;
		if ($i == 0 && $e->strand == 1) {
		    $start = $e->start + (3-$e->phase)%3;
		}elsif ($i == 0 && $e->strand == -1) {
		    $end = $e->end - (3-$e->phase)%3;
		}
		$hend += $end-$start+1;
#		if ($e->length%3 != 0) {$hend += 1; $in_codon = 1;}
		print "query: $hstart - $hend\n";
		print "target: $start - $end\n";
		my $feature_pair = Bio::EnsEMBL::FeaturePair
		    ->new(#-seqname    => $trans->stable_id,
			  -start      => $start,
			  -end        => $end,
			  -strand     => $e->strand,
			  -hseqname   => 'predict',#$trans->stable_id,
			  -hstart     => $hstart,
			  -hend       => $hend,
			  -hstrand    => 1,
			  -hcoverage  => 100,
			  -score      => 100,
			  -percent_id => $addscore||0,
			  -slice      => $e->slice,
			  );
		push @tfs, $feature_pair;
		my @efs = ($feature_pair);
		my $sf = Bio::EnsEMBL::DnaDnaAlignFeature
		    ->new(-features =>\@efs);
		$sf->analysis($trans->analysis);
		my $exon = Bio::EnsEMBL::Exon
		    ->new(
			  -stable_id   => $e->stable_id,
			  -version     => 1,
			  -analysis    => $e->analysis,
			  -slice       => $e->slice,
			  -start       => $e->start,
			  -end         => $e->end,
			  -strand      => $e->strand,
			  );
		$exon->phase($e->phase);
		$exon->end_phase($e->end_phase);
		if ($addscore) {$exon->add_supporting_features($sf); }
		$ts_new->add_Exon($exon);
		if ($exon->start==$tl->start_Exon->start) { $start_exon = $exon;}
		if ($exon->start==$tl->end_Exon->start) {$end_exon = $exon;}
	    }
	    $trans = $ts_new;
	    $trans->biotype('predict');
	    $trans->description('predict');
#	    print $trans->get_all_Exons->[0]->get_all_supporting_features()->[0]->percent_id, "\n";
# 	    @tfs = sort {$a->start*$a->strand<=>$b->start*$b->strand} @tfs;
	    my $sf = Bio::EnsEMBL::DnaDnaAlignFeature
		->new(-features =>\@tfs);
	    print "prediction supporting feature: ", $sf->start, '-', $sf->end, "\n";
	    $sf->analysis($trans->analysis);
	    if ($addscore) {$trans->add_supporting_features($sf);}
#	    if ($newtype) {$trans->biotype($newtype);}
	    
	    # can't use old translation object due to bug(?)
 	    my $tl_new = Bio::EnsEMBL::Translation->new
		(
#		 -stable_id  => $tl->stable_id,
		 -start_exon => $start_exon,
		 -end_exon   => $end_exon,
		 -seq_start  => $tl->start,
		 -seq_end    => $tl->end,
		 );
	    $tl_new->stable_id($tl->stable_id||$trans->stable_id.'P');
	    $tl_new->version(1);
	    $trans->translation($tl_new);
	    print $trans->translate->seq, " type 1\n";
	}

	my $gene = Bio::EnsEMBL::Gene->new();
	my $sid = $trans->stable_id;
	if (!($sid =~ s/T(\d+)$/$1/)) { $trans->stable_id($trans->stable_id . 'T');}
	$gene->stable_id($sid);
	$gene->version(1);
	if ($newtype) {$gene->biotype($newtype);}
 	$gene->analysis($trans->analysis);
	$gene->add_Transcript($trans);
	if ($addid) {
	    $tl->stable_id(sprintf("${idbase}%0${baselen}d.P1",$gene_id));
	    $tl->version(1);
	    $trans->stable_id(sprintf("${idbase}%0${baselen}d.T1",$gene_id));
	    $trans->version(1);
	    $gene->stable_id(sprintf("${idbase}%0${baselen}d",$gene_id));	    
	    $gene->version(1);
	    $gene_id++;
	    my $exon_id = 1;
	    foreach my $exon (@{$trans->get_all_Exons}) {
		$exon->stable_id(sprintf($trans->stable_id.".e%02d", $exon_id++));
		$exon->version(1);
	    }
	}
	print 
	    $gene->stable_id, 
	    ' ', $gene->get_all_Transcripts->[0]->stable_id,
	    ' ', $gene->get_all_Transcripts->[0]->translation->stable_id, 
	    "\n";
	if ($write) {$genedba->store($gene);$num++;}
	$transcripts->[$i] = undef;
    }
    print "$num genes transferred\n";
}else {
    my $num=0;
    foreach my $i (0..@$genes-1) {
	my $gene = $genes->[$i];
	if ($glen && $gene->length < $glen) {next;}
	my $genebase = $idbase;
	if ($addid && $coord_system eq 'chromosome' && $usechr) {
	    $genebase .= sprintf "%02sg", substr($gene->seq_region_name, 0, 2);
#	    print "$genebase\n";
	}
	my $tran_num = 1;
	my $exon_id = 1;
	my $valid = 0;
	if ($addid) {
	    if ($gene->strand == -1 && $gene_id%2 != 0) {$gene_id+=1;}
	    elsif ($gene->strand == 1 && $gene_id%2 == 0) {$gene_id -= 1;}
	}
	foreach my $tran (@{$gene->get_all_Transcripts}) {
	    if (@{$tran->get_all_Exons||[]}==0) {next;}
#	    $tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
#		->compute_translation( $tran );
	    my $t = $tran->translation;
	    if (!$plen || length($tran->translate->seq) >= $plen) {$valid=1;}
	    if (! $addid) {next;}
	    if (!$keepid || !$tran->stable_id()) {
		$tran->stable_id(sprintf("${genebase}%0${baselen}d_T%02d",$gene_id, $tran_num));
	    }
	    $tran->version(1);
	    $tran->created_date(time());
	    my $tl_id = $tran->stable_id;
	    $tl_id =~ s/_T/_P/i;
	    if ($t) {
		$t->stable_id($tl_id);
		$t->version(1);
		$t->created_date(time());
	    }
	    $tran_num++;
	}
	my @exons = @{$gene->get_all_Exons};
#	print scalar (@exons), " exons\n";
	foreach my $exon (sort {$a->start<=>$b->start} @exons) {	   
	    if (!$addid || $keepid && $exon->stable_id) {next;}
	    $exon->stable_id(sprintf("${genebase}%0${baselen}d_E%02d",$gene_id,$exon_id++));
#	    print $exon, ' ', $exon->stable_id, "\n";
	    $exon->version(1);
	    $exon->created_date(time());
	}

	foreach my $tran (@{$gene->get_all_Transcripts}) {
	    foreach my $exon (sort {$a->start<=>$b->start} @{$tran->get_all_Exons}) {
		foreach my $e (@{$gene->get_all_Exons}) {
		    if ($exon->start==$e->start && $exon->end==$e->end) {
			$exon->stable_id($e->stable_id) if $e->stable_id;
			$exon->version($e->version) if $e->version;
			last;
		    }
		}
#		print $exon, ' ', $exon->stable_id, "\n";
	    }
#	    print "\n";
	}
	foreach my $tran (@{$gene->get_all_Transcripts}) {
	    foreach my $exon (sort {$a->start<=>$b->start} @{$tran->get_all_Exons}) {
#		print $exon, ' ', $exon->stable_id, "\n";
	    }
#	    print "\n";
	}
	if (!$valid) {next;}
	if ($addid) {
	    $gene->stable_id(sprintf("${genebase}%0${baselen}d",$gene_id));
	    $gene_id += 2;
	    $gene->version(1);
	    $gene->created_date(time());
	    add_gene_desc($gene) if ! $gene->description;
	}
	print $gene->stable_id, ' ',$gene->get_all_Transcripts->[0]->stable_id,
	' ', $gene->get_all_Transcripts->[0]->translation && $gene->get_all_Transcripts->[0]->translation->stable_id, "\n";
#	print_transcript($gene->get_all_Transcripts->[0]);
#	print_transcript_seq($gene->get_all_Transcripts->[0], 1);
#	print $gene->display_xref, "\n";
	$gene->{display_xref} = undef;
	$gene->external_status('');
	$gene->external_db('');
	$gene->external_name('');
#	print $gene->external_status, "\n";
	if ($write) {$genedba->store($gene);}
	$genes->[$i] = undef;
	$num++;
    }
    print "$num genes transferred\n" if $write;
}


exit;



###########################################################
sub get_ids {
    my $idfile = shift;
    open(ID, "$idfile");
    my @ids = <ID>;
    close ID;
    my $ids = join '', @ids;
    $ids =~ s/^\s+//;
    $ids =~ s/\s+$//;
    @ids = split /\s+/, $ids;
    return \@ids;
}

# retrieve a Gene/Transcript from DB using stable_id
sub get_by_id {
    my ($adaptor, $ids) = @_;
    my @objs;
    foreach my $id (@$ids) {
	my $obj = $adaptor->fetch_by_stable_id($id);
	if (!$obj && $id =~ /^\d+$/) {$obj = $adaptor->fetch_by_dbID($id);}
	push @objs, $obj if $obj;
    }
    return \@objs;
}


# retrieve all transcripts on rice all chromosomes
sub get_all_genes {
    my ($dba, $type) = @_;
    my $slice_adaptor = $dba->get_SliceAdaptor;
    my @genes;
    my @slices = @{$slice_adaptor->fetch_all($coord_system)};
    print STDERR scalar @slices, " $coord_system(s)\n";
    for my $slice (@slices) {
#	my $repeats = $slice->get_all_RepeatFeatures();
#	print scalar @$repeats, " repeat regions\n";
	my @g;
	if ($type && $type =~ /^predict/) { 
	    @g = @{$slice->get_all_PredictionTranscripts()};
	}elsif ($type) { 
	    @g = @{$slice->get_all_Genes_by_type($type)};
	}else { @g = @{$slice->get_all_Genes};}
	push @genes, sort {$a->seq_region_start <=> $b->seq_region_start} @g;
    }
    return \@genes;
}

sub get_all_genes1 {
    my ($dba, $type) = @_;
    my @genes;
    if ($type && $type =~ /^predict/) { 
	push @genes, @{$dba->get_PredictionTranscriptAdaptor->fetch_all};
    }elsif ($type) { 
	push @genes, @{$dba->get_GeneAdaptor->fetch_all_by_biotype($type)};
    }else { push @genes, @{$dba->get_GeneAdaptor->fetch_all};}
    
    @genes = sort {$a->slice->seq_region_name cmp $b->slice->seq_region_name ?
		       $a->seq_region_start <=> $b->seq_region_start :
		       $a->slice->seq_region_name cmp $b->slice->seq_region_name
		   } @genes;
    return \@genes;
}

sub get_transcripts {
    my ($genes) = @_;
    my @trans;
    foreach my $gene (@$genes) {
	push @trans, @{$gene->get_all_Transcripts()};
    }
    return \@trans;
}

sub add_gene_desc {
    my $gene = shift;
    my $trans = $gene->get_all_Transcripts();    
    my @types;
    foreach my $tran (@$trans) {
	push @types, split /\||;|\s+/, $tran->description if $tran->description;
    }
    my %biotype = ();
    foreach my $b (@types) {
	if ($b) {$biotype{$b}=1;}
    }
    my $biotype;
    foreach my $b (sort keys %biotype) {
	$biotype ? ($biotype .= '|'.$b) : ($biotype = $b);
    }
    $gene->description($biotype);
}

sub print_transcripts {
    my ($trans) = @_;
    print "### ", scalar @$trans, " transcripts.\n";
    foreach my $t (@$trans) {
	print_transcript($t);
    }
}

sub print_transcript {
    my ($t) = @_;
    print $t->slice->name, "\n";
    $t->slice->name =~ /^(\w+)/;
    $t = $t->transform($1);
    $t = $t->transfer($t->slice);
    print $t->stable_id || $t->dbID, ' ', $t->seq_region_name, ':', 
    $t->start, '-', $t->end, ' ', $t->get_all_Exons->[0]->strand, "\n";
    print_exons($t);

    return;
    my $new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
        ->compute_translation( $t );
    my $sequence = $new_tran->translate;    
    if ( $sequence ){
	print $sequence->seq, "\n";
    }else {
	print STDERR "TRANSCRIPT WITHOUT A TRANSLATION!!\n";
    }
}

sub print_exons {
    my ($transcript) = @_;
    print "    Exons: ";
    foreach my $exon (sort {$a->start <=> $b->start} @{$transcript->get_all_Exons}) {
	print " ", $exon->start, '-', $exon->end; #print ' ', $exon->slice;
    }
    print "\n";
}

sub print_transcript_seq {
    my ($trans, $pro) = @_;
    my $exons = $trans->get_all_Exons;
    my $introns = $trans->get_all_Introns;
    print '    ';
    foreach my $i (0..@$introns-1) {
	print uc $exons->[$i]->seq->seq;
	print lc $introns->[$i]->seq;
    }
    print uc $exons->[@$exons-1]->seq->seq;
    print "\n";
    
    if (!$pro) {return;}
    
    if (!$trans->translation) {
	print "check translation ....\n";
	$trans = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $trans );
    }
    print 'start: ', $trans->translation->start, ' end: ', $trans->translation->end, "\n";
    my $sequence = $trans->translate;    
    if ( $sequence ){
	print $sequence->seq, "\n";
    }else {
	print STDERR "TRANSCRIPT WITHOUT A TRANSLATION!!\n";
    }
    
}


# ----------------------------------------------------

=head1 NAME

gene-transfer.pl

=head1 SYNOPSIS

  gene-transfer.pl -sdbhost host -sdbport dbport -sdbuser user -sdbname name 
    -sdnadb dnadb -tdbhost host -tdbport port -tdbuser user -tdbname name 
    -idfile file -gene -type type -coord_system coord

Options:

  -h|--help        Show brief help and exit
  -v|--version     Show version and exit
  -sdbhost=s         source db host
  -sdbport=s        source db port
  -sdbuser=s         db user 
  -sdbname=s        db for query genes
  -sdnadb=s          db for dna seq 
  -tdbhost=s        target  db host
  -sdbport=s        target db port
  -tdbuser=s         db user 
  -tdbname=s        db for target genes
  -idfile=s         IDs for source genes
  -isgene           if gene or transcript
  -type=s           gene type in DB
  -coord_system=s   if chromosome, gff dump each chromosome into a file

=head1 DESCRIPTION

To transfer genes from one db to another or dump genes into gff or fasta format

=head1 SEE ALSO

perl.

=head1 AUTHOR

Chengzhi Liang E<lt>liang@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2005 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut

