#!/lab/bin/perl

use strict;
use warnings;

my @lib;
BEGIN {
    # Locate the required library directories at compile time by probing
    # '.', '../.', '../../.', ... so that the later `use lib (@lib)` can
    # see them. Dies if any wanted directory is still missing once the
    # walk stops.
    my @missing = qw ( bioperl-live modules ensembl/modules ensembl-external/modules lib/perl );
    @lib = ();
    my $here     = '.';
    my $prev_sig = '';
    while (-d $here and @missing) {
        # device:inode signature of the directory being probed; the parent
        # of the root directory is the root itself, so an unchanged
        # signature means there is nowhere further up to look
        my $sig = join(":", (stat $here)[0,1]);
        last if $sig eq $prev_sig;
        $prev_sig = $sig;

        # split the wanted names into those found here and those still missing
        my @still_missing;
        foreach my $want (@missing) {
            if (-d "$here/$want") {
                push @lib, "$here/$want";
            }
            else {
                push @still_missing, $want;
            }
        }
        @missing = @still_missing;
        $here    = "../$here";
    }
    die "Can't find ",join(" or ",@missing)," in parent directories" if @missing;
}

use lib (@lib);
use Fcntl;
use Carp( qw/ carp cluck /);
use DB_File;
use CSHL::Config;
use Bio::SeqIO;
use Bio::EnsEMBL::DBLoader;

# Print the usage line, followed by any extra message lines passed as
# arguments (one per line), to STDERR and exit with status 1.
sub usage {
    my @notes = @_;
    my $extra = @notes ? join("\n", @notes) . "\n" : '';
    print STDERR "usage: $0 dbmfilename\n", $extra;
    exit 1;
}

# Prefix translation table used by idfix(): known alphabetic id prefixes
# are replaced by their standard four-letter form so that ids are uniform
# and can be incremented.
my %idfix = (
    Exon   => 'GRME',
    Transc => 'GRMT',
    Gene   => 'GRMG',
    Transl => 'GRMP',
);

# Normalise an id of the form <non-digits><digits>: the prefix is mapped
# through %idfix (unknown prefixes are kept as they are) and the number is
# zero-padded to 11 digits. Ids of any other shape are returned unchanged.
# A false id triggers a cluck() stack trace but is still processed.
sub idfix {
    my ($id) = @_;
    cluck("idfix($id)") unless $id;
    $id =~ s{^(\D+)(\d+)$}{ ($idfix{$1} || $1) . sprintf("%011d", $2) }e;
    return $id;
}

    my $newseqout = new Bio::SeqIO(-format => 'fasta',
				   -file     => ">newpeptides.fa") 
				   or die "newpeptides.fa:$!";
    my $oldseqout = new Bio::SeqIO(-format => 'fasta',
				   -file     => ">oldpeptides.fa") 
				   or die "oldpeptides.fa:$!";

    my $olddb=Bio::EnsEMBL::DBLoader->new( OldEnsemblLocator );
    my $newdb=Bio::EnsEMBL::DBLoader->new( EnsemblLocator );

my (%oldclones, 
#%newclones,
#%gene_rename, %transcript_rename, %translation_rename, %exon_rename,
%new_to_old_db,
@clone_novel,@gene_novel, @transcript_novel, @translation_novel, @exon_novel,   # =new ids not corresponding to an old one
);

my ($novel_gene_id,$novel_transcript_id,$novel_translation_id,$novel_exon_id)=('','','','');  #id to use for novel things
	#first set to largest old id


my $dbfilename=shift or usage;


    tie %new_to_old_db,'DB_File' , $dbfilename , O_RDWR|O_CREAT, 0664 , $DB_HASH or usage("$dbfilename:$!");


# Main comparison pass: for each clone of the old database, fetch the clone
# of the same id from the new database and pair up genes, transcripts,
# translations and exons whose transcript signature (see summarize_genes)
# is identical.
#
# Each summary entry is [gene id, transcript id, signature, translation
# object, [exon ids]].
#
# Output on STDOUT:
#   "< ! <clone>: ALL: <genes>"   old clone has no counterpart in the new db
#   "< <clone> <id>"              old gene/transcript without a match
#   "> <clone> <id>"              new gene/transcript without a match
#   "= <clone> <old id> <new id>" accepted gene mapping (old id after idfix)
#   "<clone>: <message>"          notes returned by one_to_one_only
#
# Accepted mappings are written to %new_to_old_db; new ids without an
# accepted mapping are collected in the @*_novel lists.
foreach my $oldclone_id ($olddb->get_all_Clone_id) {
    print STDERR "clone $oldclone_id\n";
    my $oldclone=$olddb->get_Clone($oldclone_id) or carp("Can't get old $oldclone_id") and next;
    $oldclones{$oldclone_id}=1;
    my @oldgenes= $oldclone->get_all_Genes() or carp("Can't get genes for old $oldclone_id") ;

    #calculate signature of each old transcript =peptide sequence & exon sizes
    my ($old_gene_sum,$old_gbid) = summarize_genes(@oldgenes);

    #update max old ids:
    # This must happen before any `next` below. Otherwise old clones that
    # are missing from the new database (or whose new clone has no genes)
    # would not count towards the maxima, and ids handed out later to novel
    # objects could collide with ids already used by those old clones.
    strmax_with_olds_ids_asnew (\$novel_gene_id,@oldgenes);
    strmax_with_olds_ids_asnew (\$novel_transcript_id, map { $_->[1] } @$old_gene_sum);
    strmax_with_olds_ids_asnew (\$novel_translation_id, map { $_->[3]->id } @$old_gene_sum);
    strmax_with_olds_ids_asnew (\$novel_exon_id, map { @{$_->[4]} } @$old_gene_sum);

    #get new clone of same name
    my $newclone=undef;
    eval { $newclone=$newdb->get_Clone($oldclone_id); };
    unless($newclone) { #if new clone doesn't exist then list old clone & its genes [& transcripts in a later version] and next
        # only report real errors: skip the expected "no clone" exception
        # and the case where nothing was thrown at all
        print STDERR "$@\n" if $@ and $@ !~ /MSG: no clone for /;
        print "< ! $oldclone_id: ALL: ",join(",", map { $_->id } @oldgenes),"\n";
        next;
    }
    my @newgenes= $newclone->get_all_Genes() or carp("Can't get genes for new $oldclone_id") and next;

    #calculate signature of each new transcript
    my ($new_gene_sum,$new_gbid) = summarize_genes(@newgenes);

    #map signature to [gene,transcript,...]
    my (%oldwithsig)=();
    foreach (@$old_gene_sum) { push @{$oldwithsig{$_->[2]}},$_; }

    # make prima-facie correspondences where signatures match
    my %oldGene_to_new=();
    my %newGene_to_old=();
    my %oldExon_to_new=();
    my %newExon_to_old=();
    my %oldTscript_to_new=();
    my %newTscript_to_old=();
    my %oldTlation_to_new=();
    my %newTlation_to_old=();
    foreach my $newscript (@$new_gene_sum) {
        if(my $pAoldscript=$oldwithsig{$newscript->[2]}) {
            foreach my $oldscript (@$pAoldscript) {
                $oldGene_to_new{$oldscript->[0]}->{$newscript->[0]}=1;
                $newGene_to_old{$newscript->[0]}->{$oldscript->[0]}=1;
                $oldTscript_to_new{$oldscript->[1]}->{$newscript->[1]}=1;
                $newTscript_to_old{$newscript->[1]}->{$oldscript->[1]}=1;
                $oldTlation_to_new{$oldscript->[3]->id}->{$newscript->[3]->id}=1;
                $newTlation_to_old{$newscript->[3]->id}->{$oldscript->[3]->id}=1;
                # equal signatures imply equal exon counts, so exons can
                # be paired by position
                for my $i (0..$#{$newscript->[4]}) {
                    $oldExon_to_new{$oldscript->[4]->[$i]}->{$newscript->[4]->[$i]}=1;
                    $newExon_to_old{$newscript->[4]->[$i]}->{$oldscript->[4]->[$i]}=1;
                }
            }
        }
    }
    #List what doesn't match:
    print_idnotin("< $oldclone_id ",\%oldGene_to_new,@oldgenes);
    print_idnotin("> $oldclone_id ",\%newGene_to_old,@newgenes);
    print_idnotin("< $oldclone_id ",\%oldTscript_to_new,map { $_->[1] } @$old_gene_sum );
    print_idnotin("> $oldclone_id ",\%newTscript_to_old,map { $_->[1] } @$new_gene_sum );

    #output FASTA for matchless translations:
    foreach ( grep { ! $oldTlation_to_new{$_->[3]->id} } @$old_gene_sum ) {
        $oldseqout->write_seq($_->[3]);
    }
    foreach ( grep { ! $newTlation_to_old{$_->[3]->id} } @$new_gene_sum ) {
        $newseqout->write_seq($_->[3]);
    }

=pod

    Actually, 
    ok1=have a putative match of translations & exon lengths
    ok2=have a unique match
    ok3= matching id's in genbank ( ($id)= map { $_->display_id } grep { $_->database eq 'ENTREZPRO' } $gene->each_DBLink; )
    ng3 = both have a genbank id & they differ
    then ok = !ng3 && ok1 && ( ok2 || ok3 )

=cut

    #use only the bi-unique and warn about the others
    #   (Note that one_to_one_only does idfix)
    # and remember the unassigned new ids
    my ($hash,$msg);

    ($hash,$msg)=one_to_one_only(\%oldGene_to_new,\%newGene_to_old,$old_gbid,$new_gbid);
    print map { "$oldclone_id: $_\n" } @$msg;
    @new_to_old_db{keys %$hash}=values %$hash;
    push @gene_novel, grep { !$hash->{$_} } map { $_->id } @newgenes;
    print map { "= $oldclone_id ".$hash->{$_}." $_\n" } keys %$hash;

    ($hash,$msg)=one_to_one_only(\%oldTscript_to_new,\%newTscript_to_old,$old_gbid,$new_gbid);
    print map { "$oldclone_id: $_\n" } @$msg;
    @new_to_old_db{keys %$hash}=values %$hash;
    push @transcript_novel, grep { !$hash->{$_} } map { $_->[1] } @$new_gene_sum;

    ($hash,$msg)=one_to_one_only(\%oldTlation_to_new,\%newTlation_to_old,$old_gbid,$new_gbid);
    print map { "$oldclone_id: $_\n" } @$msg;
    @new_to_old_db{keys %$hash}=values %$hash;
    push @translation_novel, grep { !$hash->{$_} } map { $_->[3]->id } @$new_gene_sum;

    ($hash,$msg)=one_to_one_only(\%oldExon_to_new,\%newExon_to_old,$old_gbid,$new_gbid);
    print map { "$oldclone_id: $_\n" } @$msg;
    @new_to_old_db{keys %$hash}=values %$hash;
    push @exon_novel,   grep { !$hash->{$_} } map { @{$_->[4]} } @$new_gene_sum;

}

# The FASTA writers are not used past this point; drop them
# (presumably so the output files get flushed and closed -- TODO confirm).
undef $oldseqout;
undef $newseqout;


# Clones that exist only in the new database: everything on them is novel.
foreach my $newclone_id ($newdb->get_all_Clone_id) {
    next if $oldclones{$newclone_id};

    my $newclone = $newdb->get_Clone($newclone_id);
    my @newgenes = $newclone->get_all_Genes();
    my $gene_list = join(",", map { $_->id } @newgenes);
    print "> ! $newclone_id: ALL: ", $gene_list, "\n";
    push @clone_novel, $newclone_id;

    # only the per-transcript summaries are needed here, not the id map
    my ($new_gene_sum) = summarize_genes(@newgenes);
    push @gene_novel, map { $_->id } @newgenes;
    foreach my $entry (@$new_gene_sum) {
        push @transcript_novel,  $entry->[1];
    }
    foreach my $entry (@$new_gene_sum) {
        push @translation_novel, $entry->[3]->id;
    }
    foreach my $entry (@$new_gene_sum) {
        push @exon_novel, @{ $entry->[4] };
    }
}

=pod

 For each novel gene, exon, transcript and translation,
 assign an id greater than all old ids.

=cut

# Hand out fresh ids to everything novel. Each counter starts at the
# largest (idfix-normalised) old id of its kind and is advanced with
# Perl's string auto-increment, so every new id sorts after all old ones.
#
# The novel lists can contain the same id more than once (for example,
# @exon_novel is built by flattening per-transcript exon lists, so an exon
# shared by several transcripts is listed repeatedly). Each id is assigned
# only once so that duplicates do not consume extra ids.
foreach my $assignment (
    [ \@gene_novel,        \$novel_gene_id        ],
    [ \@transcript_novel,  \$novel_transcript_id  ],
    [ \@translation_novel, \$novel_translation_id ],
    [ \@exon_novel,        \$novel_exon_id        ],
) {
    my ($novel_ids, $counter) = @$assignment;
    my %seen;
    foreach my $id (@$novel_ids) {
        next if $seen{$id}++;
        $new_to_old_db{$id} = ++$$counter;
    }
}

untie %new_to_old_db;

=pod

Transcript and translation numbers should stay in correspondence,
but this script does not check that - look at the dumped transcript file.

=cut


# Summarise a list of gene objects for comparison between databases.
#
# Arguments: gene objects providing id, each_DBLink and each_Transcript.
#
# Returns (\@summaries, \%ids):
#   @summaries  one entry per transcript:
#               [gene id, transcript id, signature, translation object,
#                [exon ids in order]]
#               where signature is the comma-joined exon lengths, a ':',
#               and the lower-cased peptide sequence of the translation.
#   %ids        for genes that have DB links: maps the gene id and the ids
#               of its transcripts, translations and exons to the gene's
#               links, written as "database/display_id" joined by ','.
#
# Side effect: prints one "<id> gb:<links>" line per entry of %ids to STDERR.
sub summarize_genes {
    my @summaries=();
    my %ids=();
    foreach my $gene (@_) {
        #genbankid(s) of Gene
        # (alternative kept for reference: restrict to ENTREZPRO links only)
        #my $gbid=join ",", map { $_->display_id } grep { $_->database eq 'ENTREZPRO' } $gene->each_DBLink;
        my $gbid=join ",", map { $_->database."/".$_->display_id }  $gene->each_DBLink;
        $ids{$gene->id}=$gbid if $gbid;
        #get signature of all transcripts
        foreach my $script ($gene->each_Transcript) {
            my $lation=$script->translate();
            my $aa_seq=lc($lation->seq());
            my @exons = $script->each_Exon; #all exons(in order)
            my @exon_ids = map { $_->id } @exons;
            #set ids
            if($gbid) {
                $ids{$script->id}=$ids{$lation->id}=$gbid;
                # The parentheses matter: ($gbid) x N repeats the list and
                # gives every exon the link string, whereas $gbid x N is
                # string repetition and would yield one concatenated value
                # for the first exon and undef for the rest.
                @ids{@exon_ids}=($gbid) x scalar(@exon_ids);
            }
            #calculate signature:
            my $signature=join("," ,map { $_->length } @exons).":".$aa_seq;
            push @summaries,[$gene->id,$script->id,$signature,$lation,\@exon_ids ];
        }
    }
    print STDERR map { "$_ gb:$ids{$_}\n" } keys %ids ;
    return (\@summaries,\%ids);
}


# Reduce a many-to-many correspondence to its one-to-one part and apply idfix.
#
# Arguments:
#   $oldtonew  hash ref: old id => { new id => 1, ... }
#   $newtoold  hash ref: new id => { old id => 1, ... }
#              (modified: entries reported as ambiguous are emptied so the
#               same message is not produced twice)
#   $oldid     hash ref: old id => external (genbank) id string, if any
#   $newid     hash ref: new id => external (genbank) id string, if any
#
# Returns (\%goodnewtoold, \@outmsg):
#   %goodnewtoold  new id => idfix(old id) for every accepted pair
#   @outmsg        human-readable notes about accepted id-based matches
#                  and about ambiguous correspondences
sub one_to_one_only {	#take only the one-to-one part and apply idfix
    my ($oldtonew,$newtoold,$oldid,$newid)=@_;

    my (%goodnewtoold);
    my (@outmsg);

    #first get those that are unique when require id match
    foreach my $old (keys %$oldtonew) {
        my $id=$oldid->{$old} or next;
        # objects without an external id simply do not match; the defined
        # checks avoid "uninitialized value" warnings for them
        my (@new)= grep { defined $newid->{$_} and $newid->{$_} eq $id }
                   keys %{$oldtonew->{$old}};
        next unless scalar(@new)==1;
        my @back= grep { defined $oldid->{$_} and $oldid->{$_} eq $id }
                  keys %{$newtoold->{$new[0]}};
        if(scalar(@back)==1) {
            $goodnewtoold{$new[0]}=idfix($old);
            push @outmsg,"$id: Old $old = ".$new[0];
        }
    }

    #now simply 1-1
    foreach my $old (keys %$oldtonew) {
        my (@new)=keys %{$oldtonew->{$old}};
        if(scalar(@new)==1) {
            my @olds=keys %{$newtoold->{$new[0]}};
            if(scalar(@olds)==1) {
                $goodnewtoold{$new[0]}=idfix($old);
            } elsif(scalar(@olds)>1) {
                # $newtoold->{...} is a hash ref, so list its keys;
                # dereferencing it as an array dies under strict refs
                push @outmsg,"New $new[0] corresponds to many old: ".join(",",@olds);
                $newtoold->{$new[0]}={};  # so don't get this message again
            }
        } else {
            push @outmsg,"Old $old corresponds to many new: ".join(",",@new);
        }
    }
    return (\%goodnewtoold,\@outmsg);
}

# Print "<prefix><id>" on its own line for every id that has no true entry
# in the hash ref $phash. Items of @ids may be plain id strings or objects;
# objects are reduced to their ->id first.
sub print_idnotin {
    my ($prefix, $phash, @ids) = @_;
    for my $item (@ids) {
        my $name = ref $item ? $item->id : $item;
        next if $phash->{$name};
        print "$prefix$name\n";
    }
}

# Raise the string referenced by $pmax to the largest idfix-normalised id
# found in @ids (string comparison). Items may be plain ids or objects;
# objects are reduced to their ->id first. For an object that cannot ->id,
# its class name is written to STDERR just before the ->id call is made.
sub strmax_with_olds_ids_asnew {
    my ($pmax, @ids) = @_;
    for my $item (@ids) {
        my $id = $item;
        if (ref $item) {
            print STDERR ref($item) unless $item->can('id');
            $id = $item->id;
        }
        $id = idfix($id);
        ${$pmax} = $id if $id gt ${$pmax};
    }
}



