#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Pipeline::Tools::TranscriptUtils - 

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 CONTACT

ensembl-dev@ebi.ac.uk

=head1 APPENDIX

The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _

=cut


# Let the code begin...

package Bio::EnsEMBL::Pipeline::Tools::GeneBuildUtils;

use vars qw(@ISA);
use strict;
use Data::Dumper;
use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Pipeline::Runnable::Protein::Seg;
use Bio::EnsEMBL::DnaPepAlignFeature;
use Bio::EnsEMBL::Pipeline::Tools::ExonUtils;
use Bio::EnsEMBL::Utils::PolyA;
use Bio::EnsEMBL::DBSQL::SliceAdaptor;
use Bio::EnsEMBL::PredictionTranscript;

@ISA = qw(Bio::EnsEMBL::Root);

 

###########################################################c

=head2 cluster_Transcripts

 Description : It separates transcripts according to strand and then clusters 
               each set of transcripts by calling _cluster_Transcripts_by_genomic_range()
  Args       : Array of Bio::EnsEMBL::Transcript
  Return     : Array of Bio::EnsEMBL::Pipeline::GeneComparison::TranscriptCluster

=cut

sub cluster_Transcripts {
  my ($self,$transcripts) = @_;
 
  my @forward_transcripts;
  my @reverse_transcripts;
 
  foreach my $transcript (@$transcripts){
    my @exons = @{ $transcript->get_all_Exons };
    if ( $exons[0]->strand == 1 ){
      push( @forward_transcripts, $transcript );
    }
    else{
      push( @reverse_transcripts, $transcript );
    }
  }
  
  my @forward_clusters;
  my @reverse_clusters;
  
  if ( @forward_transcripts ){
    @forward_clusters = $self->_cluster_Transcripts_by_genomic_range( @forward_transcripts );
  }
  if ( @reverse_transcripts ){
    @reverse_clusters = $self->_cluster_Transcripts_by_genomic_range( @reverse_transcripts );
  }
  my @clusters;
  if ( @forward_clusters ){
    push( @clusters, @forward_clusters);
  }
  if ( @reverse_clusters ){
    push( @clusters, @reverse_clusters);
  }
  return \@clusters;
}

############################################################

=head2 _cluster_Transcripts_by_genomic_range

 Description : It clusters transcripts according to genomic overlap
  Args       : Array of Bio::EnsEMBL::Transcript
  Return     : Array of Bio::EnsEMBL::Pipeline::GeneComparison::TranscriptCluster

=cut

sub _cluster_Transcripts_by_genomic_range{
  my ($self,$slice, @mytranscripts) = @_;
  # first sort the transcripts

  my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } map {$_->transfer($slice);} @mytranscripts;

  # create a new cluster 
  my $cluster=Bio::EnsEMBL::Pipeline::GeneComparison::TranscriptCluster->new();
  my $count = 0;
  my @cluster_starts;
  my @cluster_ends;
  my @clusters;
  
  # put the first transcript into these cluster
  $cluster->put_Transcripts( $transcripts[0] );

  $cluster_starts[$count] = $transcripts[0]->start;
  $cluster_ends[$count]   = $transcripts[0]->end;
  
  # store the list of clusters
  push( @clusters, $cluster );
  
  # loop over the rest of the transcripts
 LOOP1:
  for (my $c=1; $c<=$#transcripts; $c++){
    #print STDERR "\nIn cluster ".($count+1)."\n";
    print STDERR "start: $cluster_starts[$count] end: $cluster_ends[$count] ";
    print STDERR $transcripts[$c]->start, '-', $transcripts[$c]->end, "\n";
    #print STDERR "comparing:\n";
    #Bio::EnsEMBL::Pipeline::Tools::TranscriptUtils->_print_Transcript( $transcripts[$c] );
    
    if ( !( $transcripts[$c]->end < $cluster_starts[$count] ||
	    $transcripts[$c]->start > $cluster_ends[$count] ) ){
      $cluster->put_Transcripts( $transcripts[$c] );
      
      # re-adjust size of cluster
      if ($transcripts[$c]->start < $cluster_starts[$count]) {
	$cluster_starts[$count] = $transcripts[$c]->start;
      }
      if ( $transcripts[$c]->end > $cluster_ends[$count]) {
	$cluster_ends[$count] =  $transcripts[$c]->end;
      }
    }
    else{
      # else, create a new cluster with this feature
      $count++;
      $cluster = Bio::EnsEMBL::Pipeline::GeneComparison::TranscriptCluster->new();
      $cluster->put_Transcripts( $transcripts[$c] );
      $cluster_starts[$count] = $transcripts[$c]->start;
      $cluster_ends[$count]   = $transcripts[$c]->end;
      
      # store it in the list of clusters
      push(@clusters,$cluster);
    }
  }
  return @clusters;
}

sub cluster_transcripts_with_strand {
    my ($self, $transcripts, $fslice, $rslice) = @_;
    my $clusters = $self->cluster_transcripts_no_strands($transcripts, $fslice);
    my @clusters;
    foreach my $cluster (@$clusters) {
	my ($f, $r) = group_by_strand($cluster, $fslice, $rslice);
	if (@$f) {push @clusters, $f;}
	if (@$r) {push @clusters, $r;}
    }
    return \@clusters;
}


sub cluster_transcripts_no_strands {
    my ($self, $transcripts, $slice) = @_;
    if (@$transcripts == 0) {return [];}
    my @trans = sort { $a->start <=> $b->start } map {$_->transfer($slice);} @$transcripts;
#    print_transcripts(\@trans, "before clusting");
    my @clusters;
    my @cluster;
    my $cstart = $trans[0]->start;
    my $cend = $trans[0]->end;
    push @cluster, $trans[0];
    foreach my $i (1..@trans-1) {
	if ($trans[$i]->start <= $cend) { 
#	    print STDERR "===cluster: $cstart-$cend, ", scalar @cluster, " transcripts ";
#	    print STDERR $trans[$i]->start, '-', $trans[$i]->end, "\n";
	    push @cluster, $trans[$i];
	    if ($trans[$i]->end > $cend) {
		$cend = $trans[$i]->end;
	    }
	}else {
	    push @clusters, [@cluster];
	    print STDERR "===cluster: $cstart-$cend, ", scalar @cluster, " transcripts\n";
	    @cluster = ($trans[$i]);
	    $cstart = $trans[$i]->start;
	    $cend = $trans[$i]->end;
	}
    }
    push @clusters, [@cluster];
    print STDERR "===cluster: $cstart-$cend, ", scalar @cluster, " transcripts\n";
    return \@clusters;
}

sub combine_clusters {
    my ($self, $clusterf, $clusterb) = @_;
    my @clusters;
    foreach my $cluster1 (@$clusterf) {
	my @trans = {$cluster1->get_Transcripts()};
	foreach my $cluster2 (@$clusterb) {
	    if ($cluster2->start >= $cluster1->start 
		&& $cluster2->start <= $cluster1->end
		|| $cluster2->end >= $cluster1->start
		&& $cluster2->end <= $cluster1->end) {
		push @trans, @{$cluster2->get_Transcripts()};
	    }
	}
	push @clusters, \@trans;
    }
    return \@clusters;
}

#############################################

sub process_transcripts {
    my ($self, $transcripts, $fslice, $rslice, $high_perc_id, $low_perc_id, $max_check_id, $min_keep_id, 
	$min_intron_size, $max_bad_ratio, $max_boundary_shift, $max_end_ext,
	$max_distance, $use_description) = @_;
    my $clusters = $self->cluster_transcripts_no_strands($transcripts, $fslice);
    foreach my $i (0..@$transcripts-1) { $transcripts->[$i] = undef; }
    my @ftrans;
    my @rtrans;
    foreach my $i (0..@$clusters-1) {
	my ($f, $r) = 
	    process_cluster_no_strand($clusters->[$i], $fslice, $rslice, $min_intron_size,
				      $max_check_id, $min_keep_id, $max_bad_ratio,
				      $max_boundary_shift, $max_end_ext, $max_distance,
				      $use_description);
	push @ftrans, @$f if $f;
	push @rtrans, @$r if $r;
	$clusters->[$i] = undef;
    }

    print STDERR "\n===post processing\n";
    my @ts; 
    foreach my $t (@ftrans, @rtrans) {
	push @ts, $t->transfer($fslice);	
    }
    @ftrans = ();
    @rtrans = ();
    print_transcripts(\@ts);
    print "\n";
    my $trans = connect_all_close_transcripts(\@ts, 200);
#    print "---after connection:\n";
#    print_transcripts($trans);
    @ts = ();
    while (1) {
	my $old_trans = $trans;
	$trans = cleanup_strand_by_intron($trans);
	my $clusters = $self->cluster_transcripts_with_strand($trans, $fslice, $rslice);
	$trans = $self->process_clusters($clusters, '', $high_perc_id, $low_perc_id, 0);
	$trans = fix_strand_by_translation($trans);
#	print "---after translation strand:\n";
#	print_transcripts($trans);
	$clusters = $self->cluster_transcripts_with_strand($trans, $fslice, $rslice);
	$trans = $self->process_clusters($clusters, '', $high_perc_id, $low_perc_id, 0);
#	print_transcripts($trans, "---after processing");
	if (@$trans == @$old_trans) { last; }
    }
#    $trans = fix_strand_by_translation($trans);
    my ($fts, $rts) = group_by_strand($trans, $fslice, $rslice);
    return ($fts, $rts);
}

sub process_cluster_no_strand {
    my ($transcripts, $fslice, $rslice, $min_intron_size,
	$max_check_id, $min_keep_id, $max_bad_ratio, $max_boundary_shift, $max_end_ext,
	$max_distance, $use_description) = @_;

    print STDERR "\n\n", scalar(@$transcripts), " transcripts in cluster\n";
    my ($top_trans, $all_trans) = select_top_trans($transcripts);
    if (@$all_trans == 0) { return (undef, undef); }
    $all_trans = preprocess_cluster($all_trans, 101, 0, 1); # remove duplications
    $all_trans = cleanup_strand_by_intron($all_trans);
    my ($ftrans, $rtrans) = group_by_strand($all_trans, $fslice, $rslice);

    my ($fts, $fts_all) = process_cluster_by_group($ftrans, $min_intron_size, $max_check_id, $min_keep_id, 
				       $max_bad_ratio, $max_boundary_shift, $max_end_ext,
				       $max_distance, $use_description) if @$ftrans > 0;
    my ($rts, $rts_all) = process_cluster_by_group($rtrans, $min_intron_size, $max_check_id, $min_keep_id, 
				       $max_bad_ratio, $max_boundary_shift, $max_end_ext,
				       $max_distance, $use_description) if @$rtrans > 0;

#    return ($fts, $rts);

    print STDERR "\n===== add non-overlapping =====\n";
    my @all_top = (@{$fts||[]}, @{$rts||[]});
    my $ofts = find_non_overlapping(\@all_top, $fts_all);
    my $orts = find_non_overlapping(\@all_top, $rts_all);
    print_transcripts([@$ofts, @$orts], "non-overlapping:");

    push @$fts, @{process_cluster($ofts,'', 99.5, 99, 0, $max_end_ext)} if @$ofts > 0;
    push @$rts, @{process_cluster($orts,'', 99.5, 99, 0, $max_end_ext)} if @$orts > 0;

    return ($fts, $rts);
}

sub process_cluster_by_group1 {
    my ($trans, $min_intron_size, $max_check_id, $min_keep_id, $max_bad_ratio, 
	$max_boundary_shift, $max_end_ext, $max_distance, $use_description) = @_;
    print_transcripts($trans, "\n*******process each cluster******");

    my ($ftop_trans, $fts_all) = select_top_trans($trans) if $trans;
    if (@$fts_all==0) {return ([], []);}
    print STDERR "\n####### ", scalar(@$ftop_trans), ':', scalar(@{$ftop_trans->[0]||[]}), " top transcripts\n";
    
    # fix strands based on intron and select good introns
    my @fts = @{$ftop_trans};
    my $good_f = get_good_introns($fts_all);
    my $exons = get_all_exons($fts_all);

    my $bad_ratio = $max_bad_ratio;
    my @all_trans;
    foreach my $i (0..@fts-1) {
	my $predict = 0;
	if (@{$fts[$i]||[]} == 0) {next;} 
	elsif (@{$fts[$i]->[0]->get_all_supporting_features()} == 0) {
	    $predict=1;
	    $min_intron_size = 4;
	}
	print STDERR "\n===== process top trans group f$i =====\n";
	$fts[$i] = correct_bad_introns($fts[$i], $good_f, $max_boundary_shift);
	$fts[$i] = clean_bad_introns($fts[$i], $exons, $min_keep_id, $min_intron_size, $predict);
	$fts[$i] = clean_cluster_by_intron_ratio($fts[$i], $max_check_id, $bad_ratio);
	$fts[$i] = process_cluster($fts[$i], '', 0, 0, 0, $max_end_ext);
	push @all_trans, @{$fts[$i]};
#	$fts[$i] = connect_all_close_transcripts($fts[$i], 200);
	$fts[$i] = extend_transcripts($fts[$i], $max_end_ext, 0, 0, 1);
	if ($bad_ratio > 0.25) {$bad_ratio *= 0.75;}
	print STDERR "\n===== after clean top trans group f$i =====\n";
	print_transcripts($fts[$i], "top f$i");
    }

    my $top_fts = fix_short_trans(\@fts, $max_end_ext, $max_distance) if @fts > 0;
    
    collect_supporting_evidence($top_fts, \@all_trans);
				
    return ($top_fts, \@all_trans);
}

sub process_cluster_by_group {
    my ($trans, $min_intron_size, $max_check_id, $min_keep_id, $max_bad_ratio, 
	$max_boundary_shift, $max_end_ext, $max_distance, $use_description) = @_;
    print_transcripts($trans, "\n*******process each cluster******");

    my ($ftop_trans, $fts_all) = select_top_trans($trans) if $trans;
    if (@$fts_all==0) {return ([], []);}
    print STDERR "\n####### ", scalar(@$ftop_trans), ':', scalar(@{$ftop_trans->[0]||[]}), " top transcripts\n";
    
    # fix strands based on intron and select good introns
    my @fts = @{$ftop_trans};
    my $good_f = get_good_introns($fts_all);
    my $exons = get_all_exons($fts_all);

    my $bad_ratio = $max_bad_ratio;
    my @all_trans;
    my @good_trans;
    my @bad_trans;
    my $predict = 0;
    foreach my $i (0..@fts-1) {
	if (@{$fts[$i]||[]} == 0) {next;} 
	elsif (@{$fts[$i]->[0]->get_all_supporting_features()} == 0) {
	    $predict=1;
	    $min_intron_size = 4;
	}
	print STDERR "\n===== process top trans group f$i =====\n";
	$fts[$i] = correct_bad_introns($fts[$i], $good_f, $max_boundary_shift);
	$fts[$i] = clean_bad_introns($fts[$i], $exons, $min_keep_id, $min_intron_size, $predict);
	if (!$predict) {
	    push @all_trans, @{$fts[$i]};
	    my $bad;
	    ($fts[$i], $bad) = clean_cluster_by_intron_ratio($fts[$i], $max_check_id, $bad_ratio);
	    $fts[$i] = process_cluster($fts[$i], '', 0, 0, 0, $max_end_ext);
	    push @good_trans, @{$fts[$i]}; 
	    push @bad_trans, @$bad;
	    $fts[$i] = extend_transcripts($fts[$i], $max_end_ext, 0, 0, 1);
	}
	if ($bad_ratio > 0.5) {$bad_ratio *= 0.5;}
	elsif ($bad_ratio >= 0.3) {$bad_ratio *= 0.75;}
	print_transcripts($fts[$i], "\n===== after clean top f$i =====");
    }

    my @good_ts;
    foreach my $ts (@fts) {
	if (@{$ts||[]} > 0) { push @good_ts, $ts; }
    }
    my $bad = process_cluster(\@bad_trans, '', 101, 101, 0, $max_end_ext) if @bad_trans>0;
    if ($predict && @good_ts == 1 && $bad || $max_bad_ratio > 0 && @good_ts == 0 && $bad) {
	unshift @good_ts, $bad;
    }
    my $top_fts = fix_short_trans(\@good_ts, $max_end_ext, $max_distance) if @good_ts > 0;
    if ($max_bad_ratio > 0 && @{$top_fts||[]} == 1 && @{$bad||[]}>0) {
	if (@{$top_fts->[0]->get_all_Exons} == 1) {
	    $top_fts = fix_trans_by_low($top_fts, $bad, $max_end_ext, $max_distance, 1);
	}
    }
    collect_supporting_evidence($top_fts, \@all_trans);
				
    return ($top_fts, \@good_trans);
}

# find in trans2 not in trans1 
sub find_non_overlapping {
    my ($trans1, $trans2, $min_perc) = @_;
    my @trans;
    foreach my $t2 (@$trans2) {
	my $found = 0;
	if (@{$t2->get_all_supporting_features} == 0) {next;}
	foreach my $t1 (@$trans1) {
	    if ($t1->start <= $t2->end && $t2->start <= $t1->end) {
		$found = 1;
	    }
	}
	if (!$found) { push @trans, $t2; }
    }
    return \@trans;
}

# add all supporting evidence from trans2 to trans1 (overlapping only)
sub collect_supporting_evidence {
    my ($trans1, $trans2, $use_predict) = @_;
    foreach my $top (@{$trans1}) {
	foreach my $t (@{$trans2}) {
	    if ($top->start <= $t->end && $t->start <= $top->end) {
		$top->description(combine_description($t, $top, $use_predict));
		my ($status) = compare_transcripts_approxy(($top, $t, 25));
		if ($status > 0 && $status < 4 || $status == 5 || $status == 7) {
		    $top->add_supporting_features(@{$t->get_all_supporting_features});
		}
	    }
	}
	clean_supporting_features($top);
    }   
}

sub combine_description {
    my ($tran1, $tran2, $use_predict) = @_;
    
    if (!$use_predict && ($tran1->description||'') eq 'predict'){return $tran2->description;}
    if (!$use_predict && ($tran2->description||'') eq 'predict'){return $tran1->description;}
    my @lbs = split /\||\s+|;/, $tran1->description if $tran1->description;
    my @rbs = split /\||\s+|;/, $tran2->description if $tran2->description;
    my %description = ();
    foreach my $b (@lbs, @rbs) {
	$description{$b}=1;
    }
    my $description;
    foreach my $b (sort keys %description) {
	$description ? ($description .= '|'.$b) : ($description = $b);
    }
    return $description;
}


sub fix_short_trans {
    my ($trans_group, $max_end_ext, $max_distance) = @_;
    
    my $predicts;
    my @group;
    foreach my $g (@$trans_group) {
	if (@{$g||[]} == 0) {next;}
	if (@{$g->[0]->get_all_supporting_features()} == 0){
	    $predicts = $g;
	}else {
	    push @group, $g;
	}
    }
    if (@group == 0) {return [];}

    foreach (my $i = @group-1; $i>0; $i--) {
	$group[$i-1] = fix_trans_by_low($group[$i-1], $group[$i], $max_end_ext, $max_distance, 0);
    }

    if ($predicts) {
	$group[0] = fix_trans_by_low($group[0], $predicts, $max_end_ext, $max_distance, 1);
    }
    my $ts = process_cluster($group[0], '', 99.5, 99, 0, $max_end_ext);
    if (!$predicts) { return $ts; }

    my $good = 0;
    foreach my $t (@$ts) {
	my @sps = sort { $a->percent_id <=> $b->percent_id } @{$t->get_all_supporting_features};
	print STDERR count_utr_exons($t), ' ', bad_intron_num($t), "\n";
	if (count_utr_exons($t) <= 4 && bad_intron_num($t)<2) {
	    if (complete_cds_length($t)) {$good = 2;}
	    elsif (@$ts == 1 && ($sps[0]->percent_id>=99 && @{$ts->[0]->get_all_Exons}>2)) {
		if (!$good) { $good = 1; }
	    }
	}
    }
    print STDERR "===== finish fix short top trans =====\n\n";
    # replace bad transcripts with prediction
    if ($good == 2) {return $ts;}
    if ($good == 1) { 
	my $t = extend_transcript_by_low($ts->[0], $predicts, $max_end_ext, 0, 1);
	if ($t) { return [$t]; }
	else { return $ts; }
    }
    print STDERR "---replace by prediction\n";
    my @trans = sort {translation_length($a) <=> translation_length($b)} @{$predicts};	
    my $tr = extend_transcript_by_low($trans[0], $ts, $max_end_ext, 0, 1);
    $tr ||= $trans[0];
    collect_supporting_evidence([$tr], $ts, 1);
#    print_transcript($tr); 
#    print STDERR has_translation_start($tr), ' ', $tr->translate->seq, " -- start/length\n";
    if (@{$tr->get_all_supporting_features()} == 0
	|| has_translation_start($tr) < 100
	) {return $ts;} #not overlapping any evidence
    
    my $noover = find_non_overlapping([$tr], $ts);
    return [$tr, @$noover];
}

sub fix_trans_by_low {
    my ($high_trans, $low_trans, $max_end_ext, $max_distance, $use_predict) = @_;

    print STDERR "\n===== check complete cds =====\n";

    my $complete_cds = 0; 
    foreach my $t (@{$high_trans}) {
	if (complete_cds_length($t) > 200 && $t->cdna_coding_start() > 3
	    ) { 
	    $complete_cds = 1;
	}
    }

    print STDERR "\n===== fix trans pairs =====\n";
    my $trans_pairs = get_trans_pairs($high_trans, $max_distance);
    my @new_trans;
    foreach my $pair (@$trans_pairs) {
	my $nt = connect_trans_pair_by_low($pair, $low_trans, $max_end_ext, 1, 
					   $use_predict && !$complete_cds);
	if (!$nt) {next;}
	push @new_trans, $nt;
	if (complete_cds_length($nt) > 200) {
	    $complete_cds = 1; 
	}
    }
    
    my $ts = [@new_trans, @{$high_trans}];
    if (@new_trans > 0) {
	$ts = process_cluster($ts, '', 0, 0, 0, $max_end_ext); 
    }	
    
    print STDERR "===== fix short high trans =====\n";
    @new_trans = ();
    foreach my $k (0..@{$ts}-1) {
	my $nt = extend_transcript_by_low($ts->[$k], $low_trans, $max_end_ext, 1, 
					  $use_predict && !$complete_cds);
	if (!$nt) {next;}
	push @new_trans, $nt;
	if (complete_cds_length($nt) > 50) {
	    $complete_cds = 1; 
	}
    }

    $ts = process_cluster([@new_trans, @{$ts}], '', 0, 0, 0, $max_end_ext);
    
    my $nonover = find_non_overlapping($ts, $low_trans);
    if (@{$nonover||[]} > 1) {
	$nonover = process_cluster($nonover, '', 99.5, 99, 0, $max_end_ext);
    }
    print STDERR "===== finish fix short high trans =====\n\n";
    return [@$ts, @$nonover];
}

sub fix_short_trans1 {
    my ($trans_group, $max_end_ext, $max_distance) = @_;

    print STDERR "\n===== check complete cds =====\n";
    $trans_group->[0] = connect_all_close_transcripts($trans_group->[0], 200);

    my $complete_cds = 0; 
    my @comtran;
    foreach my $t (@{$trans_group->[0]}) {
	if (complete_cds_length($t) > 200 && $t->cdna_coding_start() > 3
	    ) { 
	    push @comtran, $t;
	    $complete_cds = 1;
	}
    }
#    my $noover = find_non_overlapping(\@comtran, $trans_group->[0]) if @comtran;
#    if ($complete_cds && @{$noover||[]} == 0) {
#	return remove_redundant_transcripts($trans_group->[0], 99.5, 99, $max_end_ext);
#    }

    print STDERR "\n===== fix trans pairs =====\n";
    my $trans_pairs = get_trans_pairs($trans_group->[0], $max_distance);
    my @new_trans;
    foreach my $pair (@$trans_pairs) {
	my @trans;
	foreach my $i (1..@$trans_group-1) {
	    if (@{$trans_group->[$i]||[]} == 0) { next; }
	    my $nt = connect_trans_pair_by_low($pair, $trans_group->[$i], $max_end_ext, 1, !$complete_cds);
	    if (!$nt) {next;}
	    push @trans, $nt;
	    if (complete_cds_length($nt) > 200 
		) {
		push @comtran, $nt;
		$complete_cds = 1; last;
	    }
	}
	if (@trans > 0) {
	    @trans = sort {translation_length($b) <=> translation_length($a)} @trans;	
	    push @new_trans, $trans[0];
	    next;
	}
    }

    my $ts = [@new_trans, @{$trans_group->[0]}];
#    $noover = find_non_overlapping(\@comtran, $trans_group->[0]) if @comtran;
#    if ($complete_cds && @{$noover||[]} == 0) {
#	$ts = process_cluster($ts, '', 99.5, 99, 0, $max_end_ext); 
#	return $ts;
#    }els
    if (@new_trans > 0) {
	$ts = process_cluster($ts, '', 0, 0, 0, $max_end_ext); 
    }	
#    print STDERR "=====partial CDS\n\n";

    print STDERR "===== fix short top trans =====\n";
    @new_trans = ();
    foreach my $k (0..@{$ts}-1) {
	my @trans;
	foreach my $i (1..@$trans_group-1) {
	    print STDERR "\ngroup $i: ";
	    if (@{$trans_group->[$i]||[]} == 0) { next; }
	    my $nt = extend_transcript_by_low($ts->[$k], $trans_group->[$i], $max_end_ext, 1, !$complete_cds);
	    if (!$nt) {next;}
	    push @trans, $nt;
	    if (complete_cds_length($nt) > 50 
		) {
		$complete_cds = 1; 
	    }
	    $ts->[$k] = $nt;
	}

	if (@trans > 0) {
	    @trans = sort {translation_length($b) <=> translation_length($a)} @trans;	
	    push @new_trans, $trans[0];
	}
#	$noover = find_non_overlapping(\@new_trans, $ts);
#	if (@$noover==0 && $complete_cds) {last;}
    }

    $ts = process_cluster([@new_trans, @{$ts}], '', 99.5, 99, 0, $max_end_ext);

    my $good = 0;
    foreach my $t (@$ts) {
	my @sps = sort { $b->percent_id <=> $a->percent_id } @{$t->get_all_supporting_features};
	print STDERR count_utr_exons($t), ' ', bad_intron_num($t), "\n";
	if (complete_cds_length($t) && count_utr_exons($t) <= 4 && bad_intron_num($t)<2
	    || $sps[0]->percent_id>=99.5 && @$ts == 1
	    ){
	    $good = 1; last;
	}
    }
    print STDERR "===== finish fix short top trans =====\n\n";
    # replace bad transcripts with prediction
    if ($good) {return $ts;}
    print STDERR "---replace by prediction\n";
    if (@{$trans_group->[-1]} > 0 
	&& @{$trans_group->[-1]->[0]->get_all_supporting_features()} == 0) {
	my @trans = sort {translation_length($a) <=> translation_length($b)} @{$trans_group->[-1]};	
	collect_supporting_evidence([$trans[0]], $ts, 1);
	if (@{$trans[0]->get_all_supporting_features()} == 0) {return $ts;}
	my $noover = find_non_overlapping([$trans[0]], $ts);
	return [$trans[0], @$noover];
    }
    return $ts;
}

sub get_trans_pairs {
    my ($trans, $max_distance) = @_;
    my @trans_pairs;
    for (my $i=0; $i < @$trans-1; $i++) {
	my $j=$i+1;
	for (; $j < @$trans; $j++) {
	    if ($trans->[$j]->start < $trans->[$i]->end) {next;}
	    if ($trans->[$j]->start - $trans->[$i]->end > $max_distance) {last;}
	    push @trans_pairs, [$trans->[$i], $trans->[$j]];
	    last;
	}
    }
    return \@trans_pairs;
}    

sub extend_transcript_by_low {
    my ($tran, $low_trans, $max_end_ext, $compatible, $use_predict) = @_;
    
    print STDERR "\n===extend by low trans: use predict: $use_predict\n";
    print_transcript($tran, "---high transcript:");
    
    my @new_trans;
    foreach my $lp (@$low_trans) {
	print_transcript($lp, "\n---low transcript:");
	if (!$use_predict && @{$lp->get_all_supporting_features} ==0) {next;}
	if ($lp->start > $tran->end || $lp->end < $tran->start){
	    next;
	}
	my $new_tran = $tran;
	if ($lp->start < $tran->start) {
	    $new_tran = extend_transcript($lp, $tran, $max_end_ext, $compatible, 1);
	}
	print STDERR "\n===after extend transcript 1\n";
	print_transcript($new_tran);
	if ($lp->end > $new_tran->end) {
	    $new_tran = extend_transcript($new_tran, $lp, $max_end_ext, $compatible);
	}
	if ($new_tran ne $tran) {
	    print STDERR "\n===after extend transcript 2\n";
	    print_transcript($new_tran);
	    $new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		->compute_translation( $new_tran );	    
	    push @new_trans, $new_tran;
	}
    }
    
    @new_trans = @{process_cluster(\@new_trans, '', 99.5, 99, 0, $max_end_ext)};
    if (@new_trans > 1) {	
	@new_trans = sort {
	    my $alen = translation_length($a);
	    my $blen = translation_length($b);
	    return $blen <=> $alen;
	} @new_trans; }
    return $new_trans[0];
}

sub connect_trans_pair_by_low {
    my ($pair, $low_trans, $max_end_ext, $compatible, $use_predict) = @_;

    print STDERR "\n===connect trans pair by low: use predict: $use_predict\n";
    print_transcript($pair->[0]);
    print_transcript($pair->[1]);
    
    my @new_trans;
    foreach my $lp (@$low_trans) {
	print_transcript($lp);
	if (!$use_predict && @{$lp->get_all_supporting_features} ==0) {next;}
	if ($lp->start > $pair->[0]->end || $lp->end < $pair->[1]->start){
	    next;
	}
	my $new_tran = extend_transcript($lp, $pair->[1], $max_end_ext, $compatible, 1);
	print STDERR "\n===after connect transcripts 1\n";
	print_transcript($new_tran);
	if ($new_tran ne $pair->[1]) {
	    $new_tran = extend_transcript($pair->[0],$new_tran, $max_end_ext, $compatible);
	    if ($new_tran ne $pair->[0]) {
		print STDERR "\n===after connect transcripts 2\n";
		print_transcript($new_tran);
		$new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		    ->compute_translation( $new_tran );	    
		push @new_trans, $new_tran;
	    }
	}
    }
    if (@new_trans > 0) {@new_trans = sort {translation_length($b) <=> translation_length($a)} @new_trans; }
    return $new_trans[0];
}


#############################################
# the strand is changed back after connection 
#############################################
sub connect_close_pair {
    my ($t1, $t2) = @_;
    my $len1 = complete_cds_length($t1);
    my $len2 = complete_cds_length($t2);
    if ($len1 >= 100 && $len2 >= 100) {return undef;}
   
    my $new_tran = combine_pair_transcripts($t1,$t2);
    if ($new_tran->length < $t1->length + $t2->length) {
	return undef;
    }
    $new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	->compute_translation( $new_tran );
    if (! $t1->translate) {
	$t1 = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $t1 );
    }
    if (! $t2->translate) {
	$t2 = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $t2 );
    }
    my $old_strand = $new_tran->strand;
    $new_tran = check_strand_by_translation($new_tran);
    if (translation_length($new_tran)
	>= translation_length($t1) + translation_length($t2)
	|| translation_length($new_tran, 1) > translation_length($t1) 
	&& translation_length($new_tran, 1) > translation_length($t2)
	) {
	if ($new_tran->strand != $old_strand) {
	    print STDERR "flip strand back: $old_strand\n";
	    $new_tran = change_orientation($new_tran);
	}
	return $new_tran;
    }
}

#############################################
# connect ignoring strand
# the strand is undetemined after connection
#############################################
sub connect_all_close_transcripts {
    my ($transcripts, $max_distance) = @_;
    print STDERR "distance: $max_distance\n";
    my @trans = sort { $a->start <=> $b->start } @$transcripts;
    my @index;
    my @new_trans;
    for my $i (0..$#trans-1) {
	my $j = $i+1;
	for (; $j < @trans; $j++) {
	    if ($trans[$j]->start < $trans[$i]->end) {next;}
	    if ($trans[$j]->start > $trans[$i]->end+$max_distance) {last;}
	    my $t1 = $trans[$i];
	    my $t2 = $trans[$j];
	    if ($t1->strand != $t2->strand) {
		if (@{$t1->get_all_Exons} == 1 && complete_cds_length($t1)<100) {
		    $t1 = change_orientation($t1);
		}elsif (@{$t2->get_all_Exons} == 1 && complete_cds_length($t2)<100) {
		    $t2 = change_orientation($t2);
		}else { next; }
	    }
	    print STDERR "===connect transcripts:\n";
	    print_transcript($t1);
	    print_transcript($t2);
	    my $new_tran = connect_close_pair($t1,$t2);
	    if ($new_tran) {
		push @new_trans, $new_tran;
		$index[$i] = 1;
		$index[$j] = 1;
	    }
	}
    }
    foreach my $i (0..$#trans) {
	if (!$index[$i]) {push @new_trans, $trans[$i];}
    }
    return \@new_trans;
}

sub select_top_trans {
    my ($transcripts) = @_;
    my @trans = @$transcripts;
    my @sp_id = map { 
	my @a = sort {$b->percent_id<=>$a->percent_id} 
	@{$_->get_all_supporting_features||[]}; $a[0]?$a[0]->percent_id:0 } 
    @trans;
    
    my @temp = @sp_id;
    @temp = sort {$b <=> $a} @temp;
    my $high_id = $temp[0];
    # group transcripts based on alignment identity
    my @top_trans;

    my @predicts;
    for (my $i = 0; $i < @trans;  $i++) {
	my $tid = $sp_id[$i];
	if ($tid == 0) {push @predicts, $trans[$i];}
	elsif ($tid >= 99) {
	    push @{$top_trans[0]}, $trans[$i];
	}elsif ($tid >= 95) {
	    push @{$top_trans[1]}, $trans[$i];
	}elsif ($tid >= 79) {
	    push @{$top_trans[2]}, $trans[$i];
	}elsif ($tid >= 47) {
	    push @{$top_trans[3]}, $trans[$i];
	}else {
	    push @{$top_trans[4]}, $trans[$i];
	}	    
    }    
    while (@top_trans > 0 && @{$top_trans[0]||[]}==0) {shift @top_trans;}
    if (@top_trans > 0 && @predicts > 0) { push @top_trans, \@predicts; }

    my @all_trans;
    # select only longest transcripts
    foreach my $i (0..@top_trans-1) {
	if (@{$top_trans[$i]||[]} == 0) {next;}
	if (@{$top_trans[$i]||[]} > 1000) { 
	    my @ts = sort {$b->length <=> $a->length} @{$top_trans[$i]};
	    $top_trans[$i] = [@ts[0..1000]]; 
	}
	push @all_trans, @{$top_trans[$i]};
    }
    print STDERR scalar(@all_trans), " top transcripts\n";
    return (\@top_trans, \@all_trans);
}

sub select_top_trans1 {
    my ($transcripts) = @_;
    my @trans = sort {
	if (!$a->get_all_supporting_features->[0]) {return 1;}
	elsif (!$b->get_all_supporting_features->[0]) {return -1;}
	else {
	    my @as = sort {$b->percent_id<=>$a->percent_id} @{$a->get_all_supporting_features}; 
	    my @bs = sort {$b->percent_id<=>$a->percent_id} @{$b->get_all_supporting_features}; 
	    return $bs[0]->percent_id <=> $as[0]->percent_id;
#	    return $b->get_all_supporting_features->[0]->percent_id 
#		<=> $a->get_all_supporting_features->[0]->percent_id; 
	}
    } @{$transcripts};

    # group transcripts based on alignment identity
    my @top_trans = ();
    my $high_id = 0;
    if ($trans[0]->get_all_supporting_features->[0]) {
	$high_id = $trans[0]->get_all_supporting_features->[0]->percent_id;
    }
    my $base = 0;
    if ($high_id >= 95) { $base = 1; }
    elsif ($high_id >= 90) { $base = 2; }
    elsif ($high_id >= 80) { $base = 4; }
    elsif ($high_id >= 60) { $base = 8; }
    else { $base = 16; }

    for (my $i = 0; $i < @trans;  $i++) {
	my $tid = $trans[$i]->get_all_supporting_features->[0] ?
	    $trans[$i]->get_all_supporting_features->[0]->percent_id : 0;
	if ($tid >= $high_id-$base) {
	    push @{$top_trans[0]}, $trans[$i];
	}elsif ($tid >= $high_id-$base*2) {
	    push @{$top_trans[1]}, $trans[$i];
	}elsif ($tid >= $high_id-$base*4) {
	    push @{$top_trans[2]}, $trans[$i];
	}elsif ($tid >= $high_id-$base*8) {
	    push @{$top_trans[3]}, $trans[$i];
	}elsif ($tid >= $high_id-$base*16) {
	    push @{$top_trans[4]}, $trans[$i];
	}elsif ($tid >= $high_id-$base*32) {
	    push @{$top_trans[5]}, $trans[$i];
	}
    }

    my @all_trans;
    # select only longest transcripts
    foreach my $i (0..@top_trans-1) {
	if (@{$top_trans[$i]||[]} == 0) {next;}
	my @ts = sort {$b->length <=> $a->length} @{$top_trans[$i]};
	if (@ts <= int(1000/($i+1))) { $top_trans[$i] = \@ts; }
	else { $top_trans[$i] = [@ts[0..int(1000/($i+1))]]; }
	push @all_trans, @{$top_trans[$i]};
    }
    print STDERR scalar(@all_trans), " top transcripts\n";
    return (\@top_trans, \@all_trans);
}

sub clean_bad_introns {
    my ($trans, $exons, $min_keep_id, $min_intron_size, $nocut) = @_;

    my @trans; 
#    print STDERR "\n===fix short or long introns...\n";
    foreach my $t (@$trans) {
#	print STDERR "1:\t";	print_transcript($t);
#	print_transcript_seq($t);
	# remove short introns
	$t = remove_introns($t, $exons, $min_keep_id, $min_intron_size);
#	print STDERR "2:\n";	print_transcript($t);
	# cut end introns and long introns
	my $ts =[$t];
	if (!$nocut) {$ts = cut_introns($t, $exons, $min_keep_id);}
#	print STDERR "3:\n";	print_transcript($t);
	push @trans, @$ts;
    }
#    print STDERR scalar @trans, " good transcripts\n";
    # although this is better done before fixing any intron problems, but
    # doing this first will slow down significantly (due to sort for each exon?)
    transfer_exon_supporting_features(\@trans, $min_keep_id); # add for predicted exons
    return \@trans;
}

sub correct_bad_introns {
    my ($trans, $good_introns, $max_boundary_shift) = @_;

    foreach my $tran (@$trans) {
	my @sup_feats1 = sort {$a->percent_id <=> $b->percent_id}
	@{$tran->get_all_supporting_features()||[]};
	my @introns = sort {$a->start<=>$b->start} @{$tran->get_all_Introns()};
	foreach my $i (0..$#introns) {
	    if ($introns[$i]->length < 8) {next;}
	    my $boundary = get_intron_boundary($introns[$i]);
	    if ($boundary eq 'GTAG' || $boundary eq 'GCAG') {
		next;
	    }
	    
#	    print "\nchecking intron: ",$tran->stable_id||$tran->dbID, ' ', $introns[$i]->start, '-', $introns[$i]->end, ' ', $boundary, "\n";
	    foreach my $good (@$good_introns) {
		if (abs($introns[$i]->length - $good->length) < 2
		    && abs($introns[$i]->start-$good->start) <= 2
		    || $sup_feats1[0] && $sup_feats1[0]->percent_id < 99.5
		    && abs($introns[$i]->length - $good->length) <= 9
		    || $sup_feats1[0] && $sup_feats1[0]->percent_id < 95
		    ) {
#		    print STDERR "**good intron: ", $good->start, ' ', $good->end, "\n";
		    if ($introns[$i]->start != $good->start
			&& abs($introns[$i]->start-$good->start)<=$max_boundary_shift) {
			my $prev = $introns[$i]->prev_Exon;
			if ($good->start-1>$prev->start 
			    && $good->start<$introns[$i]->end
			    ) {
			    change_exon($tran, $prev, $prev->start, $good->start-1);
			    @introns = @{$tran->get_all_Introns()};
			}
		    }
		    if ($introns[$i]->end != $good->end
			&& abs($introns[$i]->end-$good->end)<=$max_boundary_shift) {
			my $next = $introns[$i]->next_Exon;
			if ($next->end > $good->end+1
			    && $good->end>$introns[$i]->start
			    ) {
			    change_exon($tran, $next, $good->end+1, $next->end);
			    @introns = @{$tran->get_all_Introns()};
			}
		    }
		}
	    }
	}
    }
    return $trans;
}


sub get_good_introns {
    my ($trans) = @_;
    my @good_introns;
    foreach my $tran (@$trans) {
#	print_transcript($tran);
#	print_transcript_seq($tran);
	$tran = $tran->transfer($tran->slice);
	my $introns = $tran->get_all_Introns();
	foreach my $intron (@$introns) {
	    my $boundary = get_intron_boundary($intron);
	    if ($boundary eq 'GTAG' || $boundary eq 'GCAG') {
		if (! in_set($intron, \@good_introns)) {push @good_introns, $intron;}
	    }
	}
    }
    @good_introns = sort {$a->start <=> $b->start} @good_introns;
    return \@good_introns;
}

##################################################################
# For Gene Build: using proteins, cDNAs, ESTs based mapping
# --Chengzhi
##################################################################

# remove low quality mapping among all transcripts
sub prefilter {
    my ($self, $trans, $min_coverage, $min_perc_id, $single_exon_id, $use_predict) = @_;
    print "prefiltering: $min_coverage, $min_perc_id, $single_exon_id\n";
    my @good_trans;
    foreach my $i (0..@$trans-1) {
	my $tran = $trans->[$i];
	my @sup_feats = sort {$b->percent_id<=>$a->percent_id} 
	@{$tran->get_all_supporting_features||[]};
	if (! @sup_feats) {
	    if ($use_predict) { push @good_trans, $tran; }
	    else { 	    
		print STDERR "waring: no supporting features: ", $tran->dbID, "\n";
		next; 
	    }
	}
#	    print STDERR $sup_feats[0]->score, ' ', $sup_feats[0]->percent_id, " --1\n";
#	if ($sup_feats[0]->percent_id >= 99.5) {next;}
	if ($tran->length < 30 
	    || @{$tran->get_all_Exons()} == 1
	    && $sup_feats[0]->percent_id < $single_exon_id) { 
	    next;
 	}

	if ($tran->length > 30 && @sup_feats 
	    && $sup_feats[0]->score >= $min_coverage 
	    && (@{$tran->get_all_Exons()} > 1
		&& $sup_feats[0]->percent_id >= $min_perc_id 
		|| @{$tran->get_all_Exons()} == 1
		&& $sup_feats[0]->percent_id >= $single_exon_id)
	    ){
	    push @good_trans, $tran;
#	    print STDERR $sup_feats[0]->score, ' ', $sup_feats[0]->percent_id, " --2\n";
	}
	$trans->[$i] = undef;
    }
    return \@good_trans;
}

# process overlapping transcripts based on intron sizes and boundaries
sub preprocess_by_intron_and_cluster {
    my ($self, $transcripts, $fslice, $rslice, $high_perc_id, $low_perc_id, $max_check_id, $min_keep_id, $single_exon_id, 
	$min_intron_size, $max_bad_ratio, $max_boundary_shift) = @_;
    print_transcripts($transcripts);
    my $clusters = $self->cluster_transcripts_no_strands($transcripts, $fslice);
#    my $clusters =[$self->_cluster_Transcripts_by_genomic_range($fslice, @$transcripts)];exit;
    my (@f_clusters, @r_clusters);
    foreach my $cluster (@$clusters) {
#	print STDERR $cluster->start, '-', $cluster->end, ': ', scalar @{$cluster->get_Transcripts}, " transcripts\n";next;
	if (@$cluster > 5000) {
	    $cluster = preprocess_cluster($cluster, $high_perc_id, $low_perc_id);
	}else {
	    $cluster = preprocess_cluster($cluster, $high_perc_id, $low_perc_id, 1);    
	}
	$cluster = cleanup_strand_by_intron($cluster);
	print STDERR "\nafter filtering\n"; print_transcripts($cluster);
	my ($f, $r) = group_by_strand($cluster, $fslice, $rslice);
	print STDERR "\nafter grouping\n"; print_transcripts([@$f, @$r]);
	if (@$f) {
	    $f = preprocess_cluster($f, $high_perc_id, $low_perc_id);
	    $f = clean_introns($f, $min_keep_id, $min_intron_size, $max_boundary_shift);
	    $f = clean_cluster_by_intron_ratio($f, $max_check_id, $max_bad_ratio);
	    print STDERR " after clean, f cluster: "; print_transcripts($f);
	    if (@$f) {push @f_clusters, $f;}
	}
	if (@$r) {
	    $r = preprocess_cluster($r, $high_perc_id, $low_perc_id);
	    $r = clean_introns($r, $min_keep_id, $min_intron_size, $max_boundary_shift);
	    $r = clean_cluster_by_intron_ratio($r, $max_check_id, $max_bad_ratio);
	    print STDERR " after clean, r_cluster: "; print_transcripts($r);
	    if (@$r) {push @r_clusters, $r;}
	}
    }
    return (\@f_clusters, \@r_clusters);
}

#correct transcript strand based on intron boundaries
sub cleanup_strand_by_intron {
    my ($transcripts) = @_;
    print STDERR "***check and fix strand by intron ... ", scalar(@$transcripts), " transcripts\n";
    #if (@$transcripts == 1) { return $transcripts; } #bug
    my @trans = sort {@{$b->get_all_Exons} <=> @{$a->get_all_Exons}} @$transcripts;
#    print_transcripts(\@trans);
    
    my @values;
    my @good_trans;
    my @sup_id;
    foreach my $i (0..$#trans) {
	my $intron_num = @{$trans[$i]->get_all_Exons} - 1;
	my @sups = sort {$b->percent_id <=> $a->percent_id}
	@{$trans[$i]->get_all_supporting_features()||[]};
	$sup_id[$i] = @sups>0 ? $sups[0]->percent_id : 0;
	$values[$i] = check_strand_by_intron($trans[$i]);
	my $ispro = is_protein_support($trans[$i]);
	if ($ispro && $values[$i] < -1 || !$ispro && $values[$i] < 0) {
	    change_orientation($trans[$i]);
	    $values[$i] *= -1;
	}
	if ($values[$i] > 1 && $intron_num - $values[$i] <= 1.5
	    || $values[$i] == 1 && $intron_num - $values[$i] < 1 
	    && ($sup_id[$i] > 75) # low mapping wrong strand
	    ) {
	    push @good_trans, $trans[$i];
	    $values[$i] += 5;
	}
	if ($ispro && $values[$i] > 0) { $values[$i] += 5;}
    }
    if (@good_trans == 0) {push @good_trans, $trans[0]; }
    
    for (my $i = 0; $i < @good_trans; ) {
	# clean single-exons and unknown ones
	for my $j (0..$#trans) {
#	print_transcript($good_trans[$i], "\n---for check");
	    if ($values[$j] > 5) { next; }
	    if ($trans[$j]->start > $good_trans[$i]->end 
		|| $good_trans[$i]->start > $trans[$j]->end) {next;}
	    my $e1 = $trans[$j]->get_all_Exons;
	    my $e2 = $good_trans[$i]->get_all_Exons;
	    if ($e1->[0]->strand == $e2->[0]->strand) {next;}
	    print $good_trans[$i]->dbID,' ', $good_trans[$i]->strand, ' ',
	    $trans[$j]->dbID, ' ', $trans[$j]->strand, ' ', $values[$j], "\n";
	    change_orientation($trans[$j]);
	    print STDERR $trans[$j]->dbID, " changed orientation!\n";
	    push @good_trans, $trans[$j];
	    $values[$j] = 10;
	}
	shift @good_trans;
#	push @good_trans, $trans[$i];
#	print_transcript($trans[$i], "after check");
    }
    print STDERR "***after strand check and fix\n";
    print_transcripts(\@trans);
    return \@trans;
}

sub is_protein_support {
    my ($tran) = @_;
    if ($tran->description =~ /protein|predict/) {return 1;}
    return 0;
    my $supfeats = $tran->get_all_supporting_features();
    foreach my $sf (@$supfeats) {
	if (! $sf->isa('Bio::EnsEMBL::DnaDnaAlignFeature')) {return 1;}
	print STDERR $sf->isa('Bio::EnsEMBL::DnaDnaAlignFeature'), " dna/dna\n";
    }
    return 0;
}

# remove bad introns in bad transcripts
sub clean_introns {
    my ($trans, $min_keep_id, $min_intron_size, $max_boundary_shift) = @_;

    correct_introns($trans, $max_boundary_shift);
    my $exons = get_all_exons($trans);
    my @trans; 
#    print STDERR "\n===fix short or long introns...\n";
    foreach my $t (@$trans) {
#	print STDERR "1:\t";	print_transcript($t);
#	print_transcript_seq($t);
	# remove short introns
	$t = remove_introns($t, $exons, $min_keep_id, $min_intron_size);
#	print STDERR "2:\n";	print_transcript($t);
	# cut end introns and long introns
	my $ts = cut_introns($t, $exons, $min_keep_id);
#	print STDERR "3:\n";	print_transcript($t);
	push @trans, @$ts;
    }
#    print STDERR scalar @trans, " good transcripts\n";
    # although this is better done before fixing any intron problems, but
    # doing this first will slow down significantly (due to sort for each exon?)
    transfer_exon_supporting_features(\@trans, 60); # add for predicted exons
    return \@trans;
}

# remove short bad intron, especially there is an exon covering this intron
sub remove_introns {
    my ($tran, $exons, $min_keep_id, $min_intron_size, $max_len) = @_;
    my $sup=$tran->get_all_supporting_features->[0];
    my $perc_id = $sup->percent_id if $sup; $perc_id ||= 100;
#    print STDERR "==remove introns: \n"; print_transcript($tran); print_transcript_seq($tran);
    $min_intron_size ||= 10;
    $max_len ||= 1000;
    for my $intron (@{$tran->get_all_Introns}) {
#	print STDERR "intron length: ", $intron->length, "\n";
#	print STDERR "perc_id=$perc_id, min_keep_id=$min_keep_id\n";
	if ($intron->length > $max_len) {next;}
	if ($intron->length < $min_intron_size 
	    && ($intron->length % 3 == 0 || !is_protein_support($tran))) {
	    $tran = remove_intron($tran, $intron);
	}elsif ($perc_id >= $min_keep_id) {next;}
	my $boundary = get_intron_boundary($intron);
	if (($intron->length < 25 || $perc_id < 99 && $intron->length < 35
	     || $perc_id < 90 && $intron->length < 50
	     || $perc_id < 70 && $intron->length < 100) 
	    && $boundary ne 'GTAG'  
	    && ($intron->length % 3 == 0 || !is_protein_support($tran))) {
	    $tran = remove_intron($tran, $intron);
	    return remove_introns($tran, $exons, $min_keep_id, $min_intron_size, $max_len);
	}
	my $in_num = 0;
	my $in_exon = in_exon($intron, $exons, 1);
	if ($in_exon) {
	    $in_num = scalar @{$in_exon->get_all_supporting_features};
	}
	if ($in_num > 0 && $boundary ne 'GTAG' 
	    && $boundary ne 'GCAG' && $boundary ne 'ATAC'
	    && ($intron->length % 3 == 0 || !is_protein_support($tran))
	    ) {
	    $tran = remove_intron($tran, $intron);
	    return remove_introns($tran, $exons, $min_keep_id, $min_intron_size, $max_len);
	}
    }
#    print_transcript($tran);
    return $tran;
}

# cut transcripts into two by cutting on a bad intron:
# long introns
# long introns covering two or more exons
# end introns with short end exon
sub cut_introns {
    my ($tran, $exons, $min_keep_id) = @_;
#    print_transcript($tran, "===check/cut intron===");
    if (@{$tran->get_all_Exons} == 1) {return [$tran];}
    my @sufs = sort {$b->percent_id<=>$a->percent_id} @{$tran->get_all_supporting_features};
    my $perc_id = $sufs[0]->percent_id if $sufs[0]; $perc_id ||= 100;
    my @introns = sort {$a->start <=> $b->start} @{$tran->get_all_Introns};
    for my $i (0..$#introns) {
	my $out_num = 0;
	my $boundary = 'GTAG';
	if ($i==0 || $i==$#introns) {$boundary=get_intron_boundary($introns[$i]);}
        if ($introns[$i]->length > 2000) {
            $out_num = out_exon($introns[$i], $exons);
            $boundary=get_intron_boundary($introns[$i]);
        }

	if (! is_protein_support($tran) 
	    && ($i == 0 && $introns[$i]->prev_Exon->length<15 
		|| $i==$#introns && $introns[@introns-1]->next_Exon->length<15)
	    || $boundary ne 'GTAG' 
	    && ($i == 0 && $introns[$i]->prev_Exon->length < 25 
		|| $i==@introns-1 && $introns[@introns-1]->next_Exon->length<25)
	    || $boundary ne 'GTAG' && $boundary ne 'GCAG' && $perc_id < 99
	    && ($i == 0 && $introns[$i]->prev_Exon->length < 100 
		|| $i==@introns-1 && $introns[@introns-1]->next_Exon->length<100)
	    || ($perc_id < 99.5 && $boundary ne 'GTAG' 
		|| $perc_id < 95 && !is_protein_support($tran)) 
	    && $introns[$i]->length > 1000
	    && ($i == 0 && $introns[$i]->prev_Exon->length < 40 
		|| $i==@introns-1 && $introns[@introns-1]->next_Exon->length<40)
	    || $introns[$i]->length > 4000 && $perc_id < 95
	    && ($i == 0 && $introns[$i]->prev_Exon->length < 100 
		|| $i==@introns-1 && $introns[@introns-1]->next_Exon->length<100)
	    || ($out_num > 2 && $perc_id < 99 || $out_num > 1 && $perc_id < 90)
            && ($boundary ne 'GTAG' || $introns[$i]->length > 5000)
	    || $out_num > 1 && $perc_id < 95 && $introns[$i]->length > 10000
	    ) {
#	    print "***cut intron\n"; 
	    my ($left, $right) = split_transcript($tran, $introns[$i]);
	    if ($i == 0 && @introns > 1) {
		return [$left, @{cut_introns($right, $exons, $min_keep_id)}];
	    }elsif ($i == $introns[$#introns] && @introns > 1) {
		return [@{cut_introns($left, $exons, $min_keep_id)}, $right];
	    }else {
		return [@{cut_introns($left, $exons, $min_keep_id)}, 
			@{cut_introns($right, $exons, $min_keep_id)}];
	    }
	}
    }
    return [$tran];
}

# get all exons in a cluster
sub get_all_exons {
    my ($trans, $min_perc_id) = @_;
    $min_perc_id ||= 40;
    my %exons;
    foreach my $t (@$trans) {
	my @sup_feats = sort {$b->percent_id <=> $a->percent_id}
	@{$t->get_all_supporting_features()||[]};
	if (@sup_feats == 0 || $sup_feats[0]->percent_id < $min_perc_id) {next;}
	foreach my $e (@{$t->get_all_Exons}) {
	    $exons{$e->start.'-'.$e->end} = $e;
	}
    }
#    print STDERR scalar "totally ", scalar(keys %exons), " exons\n";
#    print join("\n", map {$_->start.'-'.$_->end} sort {$a->start<=>$b->start} values %exons), "\n";
    return [values %exons];
}

# transfer supporting evidences from all the same exons
sub transfer_exon_supporting_features {
    my ($cluster, $min_perc_id) = @_;
    $min_perc_id ||= 50;
    if (@$cluster < 2) {return $cluster;}

    my %exons;
    foreach my $tran (@$cluster) {
	my @ex = sort {$a->start <=> $b->start} @{$tran->get_all_Exons};
	for my $i (0..$#ex) {
	    # many transcripts share exons, need add only once
	    if ($i == 0) {add2set($exons{'x-'.$ex[$i]->end}||=[], $ex[$i]);}
	    elsif ($i == $#ex) {add2set($exons{$ex[$i]->start.'-x'}||=[], $ex[$i]);}
	    else {add2set($exons{$ex[$i]->start.'-'.$ex[$i]->end}||=[], $ex[$i]);}
	}
    }
#    print "region: ", (join ' ', keys %exons), "\n";
    foreach my $exons (values %exons) {
#	if (@$exons == 1) {next;} # need remove this due to repeated features
#	print scalar @$exons, " exons\n";
	my %sup_feats;
	foreach my $exon (@$exons) {
	    my @fs = @{$exon->get_all_supporting_features};
#	    print scalar @fs, " supporting features for this exon\n";	    
	    foreach my $f (@fs) {
		if ($f->percent_id > $min_perc_id) {
		    # each unique feature added only once 
		    if (!$sup_feats{$f->hseqname} || $sup_feats{$f->hseqname}
			&& $f->percent_id > $sup_feats{$f->hseqname}->percent_id) {
			$sup_feats{$f->hseqname} = $f;
		    }
		}
	    }
	}
	my @sup_feats = values %sup_feats;
	if (@sup_feats == 0) {next;}
#	print scalar @sup_feats, " supporting features for these exons\n";
	foreach my $exon (@$exons) {
#	    print $exon->dbID, "\n";
	    if (@{$exon->get_all_supporting_features||[]}>0 && 
		$exon->get_all_supporting_features->[0]->hseqname !~ /predict/i) {
		next;
	    }
#	    print "supports3: ", $exon->dbID, ': ', scalar @{$exon->get_all_supporting_features}, "\n";
	    # remove repeated features from db and add cleaned and new features
	    $exon->{_supporting_evidence} = \@sup_feats;
#	    $exon->add_supporting_features(@sup_feats);
#	    print "supports4: ", scalar @{$exon->get_all_supporting_features}, "\n";
	}
    }
    # can't add supporing features to trans, otherwise, won't remove
    return $cluster;
}

sub add2set {
    my ($exons, $exon) = @_;
    my $new = 1;
    for my $e (@$exons) {
	if ($exon->dbID != 0 && $e->dbID == $exon->dbID) {
	    $new = 0;
	    last;
	}
    }
    if ($new) {push @$exons, $exon;}
}


# cut transcript on non-canonical introns
sub cut_uncanonical_introns {
    my ($tran) = @_;
    for my $intron (@{$tran->get_all_Introns}) {
	my $boundary = get_intron_boundary($intron);
	if ($boundary ne 'GTAG' && $boundary ne 'GCAG') {
	    my ($left, $right) = split_transcript($tran, $intron);
#	    print "***right\n"; print_transcript($right);
	    return [$left, @{cut_uncanonical_introns($right)}];
	}
    }
    return [$tran];
}

# cut bad introns in bad transcritpts
sub clean_cluster_by_intron_ratio {
    my ($trans, $max_check_id, $max_bad_ratio) = @_;
    my @good_trans;
    my @bad_trans;
    foreach my $tran (@$trans) {
	my $t = filter_by_intron_ratio($tran, $max_check_id, $max_bad_ratio);
	if ($t) {push @good_trans, $t;}
	else {push @bad_trans, $tran;}
    }
    return (\@good_trans, \@bad_trans);
    if (@good_trans==0) {
	foreach my $t (sort {$b->length<=>$a->length} @$trans) {
	    if (get_bad_intron_ratio($t->get_all_Introns)<=$max_bad_ratio) {
		push @good_trans, $t;
		last;
	    }
	}
    }
}

# check if a transcript contains too many bad introns
sub filter_by_intron_ratio {
    my ($tran, $max_check_id, $max_bad_ratio) = @_;
    my $introns = $tran->get_all_Introns();
    if (@$introns == 0) { return $tran;}
    my @sufs = sort {$b->percent_id<=>$a->percent_id} @{$tran->get_all_supporting_features};
    if ($sufs[0] && $sufs[0]->percent_id >= $max_check_id) {return $tran;}

#    print_transcript($tran);
#    print $tran->get_all_supporting_features->[0]->percent_id, "\n";
#    print_transcript_seq($tran);
    my $bad_ratio = get_bad_intron_ratio($introns);
    if ($bad_ratio != 0) {
	print STDERR ($tran->stable_id||$tran->dbID), " bad introns ratio: $bad_ratio -- max perc_id: $max_check_id, $max_bad_ratio\n" ;
    }
    if ($bad_ratio <= $max_bad_ratio) {
	if ($bad_ratio <= $max_bad_ratio
	    || $sufs[0] && $sufs[0]->percent_id > 99 && $bad_ratio <= 0.5
	    || $sufs[0] && $sufs[0]->percent_id > 95 && $bad_ratio < 0.4
	    ) {
	}else {
#	    print_transcript($tran);
	    print STDERR "  ***removed1 ", $tran->strand, "\n";
#	    print_transcript_seq($tran);
	    return undef;
	}
    }else {print STDERR "  ***removed2 ", $tran->strand, "\n"; return undef;}

    return $tran;
}

sub bad_intron_num {
    my ($tran) = @_;
    my $introns = $tran->get_all_Introns();
    if (@$introns == 0) { return 0; }
    my $bad = 0;
    foreach my $intron (@$introns) {
	my $boundary = get_intron_boundary($intron);
	if ($boundary eq 'GCAG' || $boundary eq 'ATAC') {
	    $bad += 0.2;
	}elsif ($boundary ne 'GTAG') {
	    $bad++;
	}
    }

    return $bad;
}

sub get_bad_intron_ratio {
    my ($introns) = @_;
    my $bad = 0;
    foreach my $intron (@$introns) {
	my $boundary = get_intron_boundary($intron);
	if ($boundary eq 'GCAG'||$boundary eq 'ATAC') {
	    $bad+=0.2;
	}elsif ($boundary ne 'GTAG') {
	    $bad++;
	}
    }

    my $bad_ratio = $bad/@$introns;
    return $bad_ratio;
}

# check transcript strand based on intron boundaries
sub check_strand_by_intron {
    my ($tran) = @_;
    my $introns = $tran->get_all_Introns();
    if (@$introns == 0) { return 0;}
    my $forward = 0;
    my $backward = 0;
    my $bad = 0;
    my $exons = $tran->get_all_Exons();
#    print_transcript($tran);
#    print $tran->get_all_supporting_features->[0]->percent_id, "\n";
#    print_transcript_seq($tran);
    foreach my $i (0..@$introns-1) {
	my $seq = $introns->[$i]->seq;
	my $left = uc substr($seq, 0, 2);
	my $right = uc substr($seq, length($seq)-2, 2);
	my $boundary = "$left$right";
	if ($boundary eq 'GTAG') {$forward++;}
	elsif ($boundary eq 'CTAC') {$backward++;}
	elsif ($boundary eq 'GCAG'||$boundary eq 'ATAC') {$forward += 0.5;}
	elsif ($boundary eq 'CTGC'||$boundary eq 'GTAT') {$backward+=0.5;}
    }
#    print $tran->dbID, ": $forward, $backward\n";
    if ($forward > $backward) { return $forward;}
    elsif ($forward < $backward) {
	print STDERR $tran->dbID, " wrong orientation!\n";
#	print_transcript_seq($tran);
	return -$backward; 
    }
    if ($forward > 0) {
	print STDERR $tran->dbID, " strange transcript!\n";
	print_transcript_seq($tran);
    }
    return 0;
}

# change transcript strand
sub change_orientation {
    my ($tran) =@_;
#    print STDERR "***change strand for ", "\n";
#    print_transcript($tran);
    my @exons = @{$tran->get_all_Exons};
    my $strand = $tran->strand * -1;
#    my $strand = -1 * $exons[0]->strand;
    $tran->flush_Exons;
    for (my $i=@exons-1; $i>=0; $i--) {
	my $exon;# = new Bio::EnsEMBL::Exon();
	foreach my $key (keys %{$exons[$i]}) {
	    $exon->{$key} = $exons[$i]->{$key};
	}
	$exon = bless $exon, "Bio::EnsEMBL::Exon";#ref($exons[$i]);
#	print &Dumper($exon);
	$exon->strand($strand);
	$tran->add_Exon($exon);
    }
    $tran->translation(undef);
    $tran = $tran->transfer($tran->slice);
#    print_transcript($tran);
    return $tran;
}

# remove an exon from a transcript
sub remove_exon {
    my ($tran, $which) = @_;
    my @exons = @{$tran->get_all_Exons};
    $tran->flush_Exons;
    foreach my $i (0..@exons-1) {
	if ($i != $which) {$tran->add_Exon($exons[$i]);}
    }
    $tran->translation(undef);
    return $tran;    
}

# split a transcript into two on an intron
sub split_transcript {
    my ($tran, $intron) = @_;
    my @exons = @{$tran->get_all_Exons};
    my $tran1 = new Bio::EnsEMBL::Transcript();
    my $tran2 = new Bio::EnsEMBL::Transcript();
    foreach my $e (@exons) {
	if ($e->start < $intron->start) {$tran1->add_Exon($e);}
	else {$tran2->add_Exon($e);}
    }
    $tran1->dbID(0);
    $tran2->dbID(0);
    $tran1->description($tran->description);
    $tran2->description($tran->description);
    $tran1->add_supporting_features(@{$tran->get_all_supporting_features});
    $tran2->add_supporting_features(@{$tran->get_all_supporting_features});
    
    return ($tran1, $tran2);    
}

sub is_overlapping {
    my ($feat1, $feat2) = @_;
    if ($feat1->start <= $feat2->end && $feat2->start <= $feat1->end) {
	return 1;
    }
    return 0;
}

# group transcripts with considering strand
sub group_by_strand {
    my ($transcripts, $fslice, $rslice) = @_;
    my (@f, @r);
    foreach my $t (@$transcripts) {
	$t = $t->transfer($fslice);
	if ($t->strand == 1) { push @f, $t; }
	else { push @r, $t->transfer($rslice); }
    }
    return (\@f, \@r);
}

# remove redundancy, combine overlapping compatible transcripts
# if remove redundacy first, some alternative transcripts may not be extended
sub process_clusters {
    my ($self, $clusters, $evitype, $high_keep_id, $low_keep_id, $max_boundary_shift, $max_end_ext) = @_;

    my @good_trans;
    foreach my $trans (@$clusters) {
	if (scalar @$trans < 2) {push @good_trans, @$trans; next;}
#	transfer_exon_supporting_features($trans, $low_keep_id);
	$trans = process_cluster($trans, $evitype, $high_keep_id, $low_keep_id, $max_boundary_shift, $max_end_ext);
	push @good_trans, @$trans;
    }
    return \@good_trans;
}

sub preprocess_clusters {
    my ($self, $clusters, $high_keep_id, $low_keep_id) = @_;
    my @clusters;
    foreach my $cluster (@$clusters) {
	if (scalar @$cluster < 2) {push @clusters, $cluster; next;}
	print STDERR scalar @{$cluster}, " before preprocessing\n";
	my $trans = preprocess_cluster($cluster, $high_keep_id, $low_keep_id);
	print STDERR scalar @{$trans}, " after preprocessing\n";
	push @clusters, $trans;
    }
    return \@clusters;
}

# remove low quality transcripts from a cluster to speed up
# also reduce the number of spurious transcripts
sub preprocess_cluster {
    my ($transcripts, $high_perc_id, $low_perc_id, $nostrand) = @_;
    print STDERR "preprocess cluster: ", scalar @$transcripts, " -- high: $high_perc_id, low: $low_perc_id\n";
    my $start = $transcripts->[0]->start;
    my $end = $transcripts->[-1]->end;
    print "cluster $start - $end\n";
    if ($end - $start < 1000000) {$end = $start+1000000;}

#    print "cluster: ", $transcripts->[0]->start, ' ', $transcripts->[@$transcripts-1]->end, "\n";
#    for my $p (0..-1) {print "bad\n";}
    # from high to low
    my @sup_feats = map { my @a = sort {$b->percent_id<=>$a->percent_id} @{$_->get_all_supporting_features}; $a[0] } @$transcripts;
    my @final_trans;
#print_transcripts($transcripts);  
    foreach my $i (0..@$transcripts-1) {
	if (not defined $transcripts->[$i]) {next;} # already removed
	foreach my $j (0..$#final_trans) {
	    if ($transcripts->[$i]->start > $final_trans[$j]->end  
		|| $transcripts->[$i]->end < $final_trans[$j]->start) {
		next;
	    } # non-overlapping
	    
	    # overlapping transcripts from the same evidence keep only one
	    # to prevent mapping errors
	    my $sup_feats1 = $transcripts->[$i]->get_all_supporting_features;
	    my $sup_feats2 = $final_trans[$j]->get_all_supporting_features;
	    if ($sup_feats1->[0] && $sup_feats2->[0]
		&& $sup_feats1->[0]->hseqname eq $sup_feats2->[0]->hseqname
		&& (@{$sup_feats1} == 1 && @{$sup_feats2} == 1)) { # && or || ?
		if ($sup_feats1->[0]->percent_id > $sup_feats2->[0]->percent_id
		    || $sup_feats1->[0]->percent_id==$sup_feats2->[0]->percent_id
		    && ($sup_feats1->[0]->score > $sup_feats2->[0]->score
			|| $sup_feats1->[0]->score == $sup_feats2->[0]->score
			&& $transcripts->[$i]->end-$transcripts->[$i]->start
			< $final_trans[$j]->end-$final_trans[$j]->start
			)
		    ) {
		    print STDERR "pre-remove1 ", $final_trans[$j]->dbID, "\n";
		    print STDERR "pre-keep ", $transcripts->[$i]->dbID, "\n";
		    $final_trans[$j] = $transcripts->[$i];
		}
		print STDERR "pre-remove2 ", $transcripts->[$i]->dbID, "\n";
		$transcripts->[$i] = undef;
		last;
	    }
	}
	if ($transcripts->[$i]) { 
	    push @final_trans, $transcripts->[$i]; 
	    if ($transcripts->[$i]->start < $start) {
		$start = $transcripts->[$i]->start;
	    }
	    if ($transcripts->[$i]->end > $end) {
		$end = $transcripts->[$i]->end;
	    }
	    $transcripts->[$i] = undef;
	}
	if (@final_trans > 2000 && @final_trans*1000/($end-$start+1)>1) {
	    last;
	}
    }
    foreach my $j (0..@$transcripts-1) {
	$transcripts->[$j] = undef; # remove genes for memory
    }

    print STDERR scalar @final_trans, " top trans in cluster\n";
#print_transcripts(\@final_trans);    
    if ($nostrand) {return \@final_trans;}

    $transcripts = [@final_trans];
    @final_trans=();
    # get high quality transcripts to check if there are enough
    foreach my $i (0..@$transcripts-1) {
	if ($transcripts->[$i] && $sup_feats[$i]
	    && $sup_feats[$i]->percent_id >= $high_perc_id) {
	    push @final_trans, $transcripts->[$i];
	    $transcripts->[$i] = undef;
	}
    }

    print STDERR scalar @final_trans, " best\n";
#print_transcripts(\@final_trans);    
    if (@final_trans > 1000) { 
	if ($high_perc_id >= 99.9) { # if quality good enough
	    return \@final_trans; 
	}else { # otherwise remove some since quality not good
	    return filter_cluster(\@final_trans, $high_perc_id, 1/3*$high_perc_id+2/3*$low_perc_id);
	}
    }elsif (@final_trans > 0) {
	foreach my $i (0..@$transcripts-1) {
	    if ($transcripts->[$i] && $sup_feats[$i]
		&& $sup_feats[$i]->percent_id >= $low_perc_id) {
		push @final_trans, $transcripts->[$i];
		$transcripts->[$i] = undef;
	    }
	}	
	if (@final_trans <= 1000) {
	    return \@final_trans;
	}
	return filter_cluster(\@final_trans, $high_perc_id, 1/3*$high_perc_id+2/3*$low_perc_id);
    }
    foreach my $t (@$transcripts) {
	if (defined $t) { push @final_trans, $t; }
    }
    return filter_cluster(\@final_trans, $high_perc_id, $low_perc_id);
}

sub filter_cluster {
    my ($transcripts, $high_perc_id, $low_perc_id) = @_;
    print STDERR "filter cluster: ", scalar @$transcripts, " -- high: $high_perc_id, low: $low_perc_id\n";
#    print "cluster: ", $transcripts->[0]->start, ' ', $transcripts->[@$transcripts-1]->end, "\n";
    # from high to low
    my @trans = @$transcripts;
    my @sp_id = map { my @a = sort {$b->percent_id<=>$a->percent_id} @{$_->get_all_supporting_features||[]}; $a[0]?$a[0]->percent_id:0 } @$transcripts;
    foreach my $i (0..$#trans-1) {
	if (not defined $trans[$i]) {next;} # already removed
	foreach my $j ($i+1..$#trans) {
	    if (not defined $trans[$j]) {next;} # already removed
	    if ($trans[$i]->start > $trans[$j]->end  
		|| $trans[$j]->start > $trans[$i]->end) {next;} # non-overlapping

	    if ($sp_id[$j] < $low_perc_id) {
		if ($sp_id[$i] >= $high_perc_id) {
		    $trans[$j] = undef; # remove very low quality
		}elsif ($sp_id[$i] < $low_perc_id) {# both low
		    if ($sp_id[$i] - $sp_id[$j] < 10 
			&& $trans[$i]->length < $trans[$j]->length # use longer
			|| $sp_id[$i]-$sp_id[$j] < 20 
			&& $trans[$i]->length < 0.9*$trans[$j]->length
			|| $sp_id[$i]-$sp_id[$j] < 30 
			&& $trans[$i]->length < 0.8*$trans[$j]->length) {
			$trans[$i] = undef; 			
			last;
		    }else {
			$trans[$j] = undef; 
		    }
		}
	    }
	}
    }

    my @final_trans;
    foreach my $t (@trans) {
	if (defined $t) { push @final_trans, $t; }
    }
    if (@final_trans <= 1000 || $high_perc_id - $low_perc_id <= 0.5) { 
	return \@final_trans;
    }else {
	return filter_cluster(\@final_trans, $high_perc_id, ($high_perc_id+$low_perc_id)/2);
    }
}

sub process_cluster {
    my ($trans, $evitype, $high_keep_id, $low_keep_id, $max_boundary_shift, $max_end_ext) = @_;
    $max_boundary_shift = 0; # no need to allow intron boundary mismatch now 
#    print "---2 cluster: ";
#    foreach my $t (@$trans) {
#	print $t->dbID, ' ';
#    }
#    print "\n";
    print_transcripts($trans, "===process cluster...");
    while (1) {
	my $old_trans = $trans;
	if (@$old_trans > 1) {
    print STDERR scalar @$trans, " before removing redundancy\n";
	    $trans = remove_redundant_transcripts($trans, $high_keep_id, $low_keep_id, $max_end_ext);
    print STDERR scalar @$trans, " after removing redundancy\n";
	    $trans = combine_transcripts($trans, $evitype, $high_keep_id, $low_keep_id, $max_end_ext);
    print STDERR scalar @$trans, " after combining\n";
	}else {last;}
	if (@$trans == @$old_trans) {last;}
    }
    $trans = remove_bad_long_trans($trans);
    print STDERR scalar @$trans, " after processing\n";
    return $trans;
}

sub post_process {
    my ($self, $transcripts, $evitype, $fslice, $rslice, $high_threshold, 
	$low_threshold, $max_boundary_shift, $max_end_ext, $max_check_id,
	$min_keep_id, $min_intron_size, $max_bad_ratio, $use_predict, $max_distance) = @_;
    print STDERR "post process... "; print_transcripts($transcripts);
    my $clusters = $self->cluster_transcripts_no_strands($transcripts, $fslice);
 
    my @trans;
    foreach my $cluster (@$clusters) {
	cleanup_strand_by_intron($cluster);
	correct_introns($cluster, $max_boundary_shift);
	fix_strand_by_translation($cluster);
	$cluster = clean_introns($cluster, $min_keep_id, $min_intron_size, $max_boundary_shift);
#	if ($use_predict) {$cluster = clean_predicted($cluster);}
	push @trans, @$cluster;
    }
    $transcripts = connect_transcripts(\@trans, 200);
    $clusters = $self->cluster_transcripts_no_strands($transcripts, $fslice);

    my (@f_transcripts, @r_transcripts) = ();
    foreach my $cluster (@$clusters) {
#	print "---1 cluster: ";
#	foreach my $t (@{$cluster}) {
#	    print $t->dbID, ' ';
#	}
#	print "\n";
#	fix_strand_by_translation($cluster);
#	cleanup_opposite_strand($cluster);
#	correct_introns($cluster, $max_boundary_shift);
	my ($f, $r) = group_by_strand($cluster, $fslice, $rslice);
#	if ($use_predict) {$f = clean_predicted($f);}
	if ($f && scalar @$f > 0) {
#	    $f = clean_introns($f, $min_keep_id, $min_intron_size, $max_boundary_shift);
#	    $f = clean_cluster_by_intron_ratio($f, $max_check_id, $max_bad_ratio);
	    if (@$f > 1) {
		$f = connect_transcripts($f, 200);
		if ($use_predict) {
		    $f = connect_transcripts_by_prediction($f, $max_end_ext, $max_check_id, $max_bad_ratio, $max_distance);
		}
	    }
	    if ($use_predict) {$f = clean_predicted($f);}
	    if (@$f > 1) {
		print STDERR "\n\n***after clean1: "; print_transcripts($f);
		$f = clean_cluster_by_intron_ratio($f, $max_check_id, $max_bad_ratio);
		$f = process_cluster($f, $evitype, $high_threshold, $low_threshold, 0, $max_end_ext);
		print STDERR "***after clean2: "; print_transcripts($f);print "\n";
	    }
	    foreach my $t (@$f) {
		push @f_transcripts, $t->transfer($fslice);
	    }
	}
#	if ($use_predict) {$r = clean_predicted($r);}
	if ($r && scalar @$r > 0) {
#	    $r = clean_introns($r, $min_keep_id, $min_intron_size, $max_boundary_shift);
#	    $r = clean_cluster_by_intron_ratio($r, $max_check_id, $max_bad_ratio);
	    if (@$r > 1) {
		$r = connect_transcripts($r, 200);
		if ($use_predict) {
		    $r = connect_transcripts_by_prediction($r, $max_end_ext, $max_check_id, $max_bad_ratio, $max_distance);
		}
	    }
	    if ($use_predict) {$r = clean_predicted($r);}
	    if (@$r > 1) {
		print STDERR "\n\n***after clean1: "; print_transcripts($r);
		$r = clean_cluster_by_intron_ratio($r, $max_check_id, $max_bad_ratio);
		$r = process_cluster($r, $evitype, $high_threshold, $low_threshold, 0, $max_end_ext);
		print STDERR "***after clean2: "; print_transcripts($r);print "\n"; 
	    }
	    foreach my $t (@$r) {
		if ($rslice) {push @r_transcripts, $t->transfer($rslice);}
		else {push @r_transcripts, $t->transfer($fslice);}
	    }
	}
    } 

    print "### after post processing:\n";
    print_transcripts(\@f_transcripts);
    print_transcripts(\@r_transcripts);
    return (\@f_transcripts, \@r_transcripts);
}

sub clean_predicted {
    my ($trans) = @_;
    if (@$trans == 1) { 
	if ($trans->[0]->description eq 'predict') {return [];}
	else {return $trans;}
    }
    my @trans;
    my @ptrans;
    my $cdna = 0;
    foreach my $t (@$trans) {
	if ($t->description eq 'predict') { 
	    push @ptrans, $t;
	}else {
	    push @trans, $t;
	}
    }
    return \@trans;
}

sub in_set {
    my ($obj, $set) = @_;
    foreach my $o (@$set) {
	if ($o->start == $obj->start && $o->end == $obj->end) {return 1;}
    }
    return 0;
}


# correct non canonical introns if there are good ones within a few bp
sub correct_introns {
    my ($cluster, $max_boundary_shift) = @_;
    print "max shift: $max_boundary_shift\n";
    if ($max_boundary_shift < 1) {return;}
    my @good_introns;
    foreach my $tran (@$cluster) {
#	print_transcript($tran);
#	print_transcript_seq($tran);
	$tran = $tran->transfer($tran->slice);
	my $introns = $tran->get_all_Introns();
	foreach my $intron (@$introns) {
	    my $boundary = get_intron_boundary($intron);
	    if ($boundary eq 'GTAG' || $boundary eq 'GCAG') {
		if (! in_set($intron, \@good_introns)) {push @good_introns, $intron;}
	    }
	}
    }
    if (! @good_introns) {return;}

    @good_introns = sort {$a->start <=> $b->start} @good_introns;
    print scalar @good_introns, ' ', "good introns\n";
    # change splice boundary using good introns
    foreach my $tran (@$cluster) {
	my @sup_feats1 = sort {$a->percent_id <=> $b->percent_id}
	@{$tran->get_all_supporting_features()||[]};
	my @introns = sort {$a->start<=>$b->start} @{$tran->get_all_Introns()};
	foreach my $i (0..$#introns) {
	    if ($introns[$i]->length < 8) {next;}
	    my $boundary = get_intron_boundary($introns[$i]);
	    if ($boundary eq 'GTAG' || $boundary eq 'GCAG') {
		next;
	    }
#	    my $bseq = get_intron_exon_boundary($intron,3,2);
#	    if ($bseq =~ /GT.{2,4}AG/i) {
#	    }
	    
	    print "\nchecking intron: ",$tran->stable_id||$tran->dbID, ' ', $introns[$i]->start, '-', $introns[$i]->end, ' ', $boundary, "\n";
	    foreach my $good (@good_introns) {
#		if (abs($introns[$i]->length - $good->length) > $max_boundary_shift) {next;}
#		if ($introns[$i]->end < $good->start) {last;}
#		if ($introns[$i]->start > $good->end) {next;}
		if (abs($introns[$i]->length - $good->length) < 2
		    && abs($introns[$i]->start-$good->start) <= 2
		    || $sup_feats1[0] && $sup_feats1[0]->percent_id < 99
		    && abs($introns[$i]->length - $good->length) <= 9
		    || $sup_feats1[0] && $sup_feats1[0]->percent_id < 95
		    ) {
		    print STDERR "**good intron: ", $good->start, ' ', $good->end, "\n";
		    if ($introns[$i]->start != $good->start
			&& abs($introns[$i]->start-$good->start)<=$max_boundary_shift) {
			my $prev = $introns[$i]->prev_Exon;
			if ($good->start-1>$prev->start 
			    && $good->start<$introns[$i]->end
			    ) {
			    change_exon($tran, $prev, $prev->start, $good->start-1);
			    @introns = @{$tran->get_all_Introns()};
			}
		    }
		    if ($introns[$i]->end != $good->end
			&& abs($introns[$i]->end-$good->end)<=$max_boundary_shift) {
			my $next = $introns[$i]->next_Exon;
			if ($next->end > $good->end+1
			    && $good->end>$introns[$i]->start
			    ) {
			    change_exon($tran, $next, $good->end+1, $next->end);
			    @introns = @{$tran->get_all_Introns()};
			}
		    }
		}
	    }
	}
    }
}

sub get_intron_exon_boundary {
    my ($intron, $in_len, $ex_len) = @_;
    my ($e1, $e2) = sort {$intron->prev_Exon->strand * $a->start <=>
			      $intron->prev_Exon->strand *$b->start }
    ($intron->prev_Exon, $intron->next_Exon);
#    print STDERR $e1->start, ' ', $e2->start, "\n";
#    print STDERR uc $e1->seq->seq;
#    print STDERR lc $intron->seq;
#    print STDERR uc $e2->seq->seq;
#    print STDERR "\n";
    my $seq = $intron->seq;
    my $left = substr($seq, 0, $in_len);
    my $right = substr($seq, length($seq)-$in_len, $in_len);
    my $s = $e1->seq->seq;
    my $bseq = substr($s, length($s)-$ex_len, $ex_len)
	.lc($left).'.'.lc($right).substr($e2->seq->seq, 0, $ex_len);
    return $bseq;
}

sub change_exon {
    my ($tran, $exon, $new_start, $new_end) = @_;
    if ($new_end <= $new_start) {return $tran;}
    if ($exon->start != $new_start || $exon->end != $new_end) {
	print "***change exon: ", $tran->dbID, ': ', $exon->start, '-', $exon->end, " $new_start-$new_end\n";
#	print_transcript($tran);
	my $new_exon = $exon->strand == 1 ?
	    $exon->adjust_start_end($new_start-$exon->start, $new_end-$exon->end)
	    : $exon->adjust_start_end($exon->end-$new_end, $exon->start-$new_start);
	$tran->swap_exons($exon, $new_exon);
	$tran->translation(undef);
#	print_transcript($tran);
    }
    return $tran;
}

sub in_exon {
    my ($intron, $exons, $nonstrict) = @_;
    foreach my $exon (@$exons) {
#	print STDERR $exon->start, '-', $exon->end, ' ', $intron->start, '-', $intron->end, "\n";
	if ($exon->start == $intron->prev_Exon->start 
	    && $exon->end > $intron->end
	    || $exon->end==$intron->next_Exon->end 
	    && $exon->start<$intron->start
	    || $nonstrict && $exon->start<$intron->start && $exon->end>$intron->end
	    ) {
#	    print " --in\n";
	    return $exon;
	}
    }
    return undef;
}

sub is_out_trans {
    my ($query, $intron, $trans, $min_perc_id) = @_;
    $min_perc_id ||= 98;
    my $in_trans = 0;
    my @in_trans;
    my @in_exons;
    foreach my $t (@$trans) {
	if ($t==$query || !is_overlapping($intron, $t)||@{$t->get_all_Exons}==1) {next;}
	my @sufs = sort {$b->percent_id<=>$a->percent_id} @{$t->get_all_supporting_features};
	if ($sufs[0]->percent_id < $min_perc_id) {next;}
	foreach my $exon (@{$t->get_all_Exons}) {
	    if ($exon->start >= $intron->start && $exon->end <= $intron->end) {
		print STDERR " ** contains trans: ", $t->stable_id||$t->dbID, "\n";
		push @in_exons, $exon; 
	    }
	}
    }
    print STDERR "***intron ", $intron->start, '-', $intron->end, " contains exons: ", scalar @in_exons, "\n";
    if (non_overlapping_num(\@in_exons) > 2) { return 1; }
    return 0;
}


sub remove_bad_long_trans {
    my ($trans) = @_;
    if (@$trans < 5) {return $trans;}
    my @trans = sort {$b->length <=> $a->length} @$trans;
    foreach my $i (0..@trans-1) {
	if (!$trans[$i]) {next;}
	my @covered_trans;
	foreach my $j (0..@trans-1) {
	    if ($i==$j) {next;}
	    if (!$trans[$j] || @{$trans[$j]->get_all_Exons}==0) {next;}
	    if (is_overlapping($trans[$i], $trans[$j])) {
		push @covered_trans, $trans[$j];
	    }
	}
	if (non_overlapping_num(\@covered_trans) > 3) {
	    foreach my $intron (@{$trans[$i]->get_all_Introns}) {
		if ($intron->length < 5000) {next;}
		if (is_out_trans($trans[$i], $intron, \@covered_trans)) {
		    print STDERR "** remove: ";
		    print_transcript($trans[$i]);
		    $trans[$i] = undef;
		    last;
		}
	    }
	}
    }
    my @good_trans;
    foreach my $t (@trans) {
	if ($t) { push @good_trans, $t; }
    }
    return \@good_trans;
}

sub non_overlapping_num {
    my ($trans) = @_;
    my @trans = sort {$a->length <=> $b->length} @$trans;
    my @new_trans = ($trans[0]);
    foreach my $i (1..@trans-1) {
	my $overlapping = 0;
	foreach my $t (@new_trans) {
	    if (is_overlapping($trans[$i], $t)) {
		$overlapping = 1;
		last;
	    }
	}
	if (!$overlapping) {
	    push @new_trans, $trans[$i];
	}
    }
    print STDERR "non-overlapping: ", scalar @new_trans, "\n";
    return scalar @new_trans;
}

sub out_exon {
    my ($intron, $exons) = @_;
    my @exons;
    foreach my $exon (@$exons) {
	if ($exon->start > $intron->start && $exon->end < $intron->end) {
	    if (! in_set($exon, \@exons)) {push @exons, $exon;}
#	    print STDERR $exon->start, '-', $exon->end, ': ', $num, "\n";
	}
    }
    print STDERR scalar @exons, "*** in intron ", $intron->start, '-', $intron->end, "\n";
    return scalar @exons;
}


sub remove_intron {
    my ($tran, $intron) = @_;
    my @exons = sort {$a->start<=>$b->start} @{$tran->get_all_Exons};
#    print_transcript($tran);
    $tran->flush_Exons();
    for (my $i = 0; $i < @exons; $i++) {
	if ($i < @exons-1 &&
	    $exons[$i]->end+1 == $intron->start
	    && $exons[$i+1]->start-1 == $intron->end) {
	    $tran->add_Exon(combine_exons($exons[$i], $exons[$i+1]));
	    $i++;
	}else {$tran->add_Exon($exons[$i]);}
    }
    $tran->translation(undef);
#    print_transcript($tran);
    return $tran;
}

sub combine_exons {
    my ($le, $re) = @_; 
    my $exon = $le->strand == 1 ?
	$le->adjust_start_end(0, $re->end - $le->end)
	: $le->adjust_start_end($le->end - $re->end, 0);
    return $exon;
}

sub get_intron_boundary {
    my ($intron, $len) = @_;
    $len ||= 2;
    my $seq = $intron->seq;
    my $left = substr($seq, 0, $len);
    my $right = substr($seq, length($seq)-$len, $len);
    my $boundary = "$left$right";
    return $boundary;
}

# extend shorter transcripts with compatible ends in incompatible transcripts
sub post_extension {
    my ($self, $transcripts, $evitype, $fslice, $rslice, $max_boundary_shift, $max_end_ext, $high_keep_id, $low_keep_id) = @_;

#    print_transcripts($transcripts);
#    my $clusters = $self->cluster_Transcripts($transcripts);
    my $clusters = $self->cluster_transcripts_with_strand($transcripts, $fslice, $rslice);
    my @new_trans;
    foreach my $cluster (@$clusters) {
	my $trans = $cluster;#->get_Transcripts();
#    print_transcripts($trans);
	if (scalar @$trans > 1) {
	    $trans = extend_transcripts($trans, $max_end_ext, $high_keep_id, $low_keep_id, 1);
	}
	push @new_trans, @$trans;
    }
    return \@new_trans;
}

sub fix_strand_by_translation {
    my $transcripts = shift;
    print "\nfix strand by translation\n";
    foreach my $i (0..@$transcripts-1) {
	$transcripts->[$i] = check_strand_by_translation($transcripts->[$i]);
    }
    return $transcripts;
}


sub check_strand_by_translation {
    my ($tran) = @_;

    my @exons = @{$tran->get_all_Exons()};
    my @sufs = sort {$a->percent_id<=>$b->percent_id} @{$tran->get_all_supporting_features};
    if (@exons > 1 && is_protein_support($tran) 
	&& $tran->translate && length($tran->translate->seq)>200) {return $tran;}
    if (@exons > 3) {return $tran;}
    my $good = check_strand_by_intron($tran);
    if ($good > 1 && $good > @exons-2 && $exons[0]->length > 40 && $exons[$#exons]->length > 40
	|| $good == 1 && (@sufs==0 || $sufs[0]->percent_id>99)
	) {
	return $tran;
    }
#	    print_transcript($tran);print_exons($tran);
    $tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	->compute_translation( $tran );
    my $pro1 = $tran->translate;
    my $cds1_pseudo = is_pseudo_cds($tran);
    print STDERR $pro1->seq, " --1\n" if $pro1;
    if ($pro1 && $pro1->seq =~ /\*/) {$pro1 = undef;}
    $tran = change_orientation($tran);
#	    print_transcript($tran);print_exons($tran);
    $tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	->compute_translation( $tran );
    my $pro2 = $tran->translate;
    my $cds2_pseudo = is_pseudo_cds($tran);
    print STDERR $pro2->seq, " --2\n" if $pro2;
    if ($pro2 && $pro2->seq =~ /\*/) {$pro2 = undef;}
    if (!$pro1 && !$pro2) {return $tran;}
    if (!$pro2) {
	$tran = change_orientation($tran); # change back
	$tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $tran );
    }elsif ($pro1 && $pro2) {
	my $pro1_len = length($pro1->seq);
	my $pro2_len = length($pro2->seq);
	my $pro1_mlen = protein_length($pro1->seq, $pro1_len);
	my $pro2_mlen = protein_length($pro2->seq, $pro2_len);
	print STDERR "$good, $pro1_len, $pro2_len, $pro1_mlen, $pro2_mlen\n";
	if ($pro2_mlen >= 100 && $pro2_len > $pro1_len && $pro2_mlen > 3 * $pro1_mlen
	    || $good < 1 && ($pro2_len > $pro1_len 
			     && $pro2_mlen > 2*$pro1_mlen)
	    || $cds1_pseudo && !$cds2_pseudo
	    || !$cds1_pseudo && !$cds2_pseudo 
	    && $pro2_len > $pro1_len && ($good < 1 || $pro2_mlen > 2*$pro1_mlen)
	    ) {
	    print STDERR "keep opposite\n";
	}else {
	    $tran = change_orientation($tran); # change back
	    $tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		->compute_translation( $tran );
	    print STDERR "keep original\n";
	}		
    }else {
	print STDERR "keep opposite\n";
    }

#    print_transcript($tran, "after check translation");
    return $tran;
}


# can't use any other biotype-checking functions here since biotype has been changed
sub filter_by_translation {
    my ($transcripts, $min_translation, $min_cds_ratio, $max_utr_len, $max_utr_exons, $complete_cds) = @_;

    print STDERR "\n=========check translation: $min_translation, $min_cds_ratio, $complete_cds\n";
#    print STDERR scalar(@$transcripts), " transcripts\n";
    foreach my $t (@$transcripts) {
	$t->biotype('protein_coding');
	if (complete_cds_length($t) < 200) {
	    $t = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		->compute_translation( $t );
	}
	eval{
	    print STDERR "set start, stop for: transcript ", $t->dbID, "\n";
	    $t = Bio::EnsEMBL::Pipeline::Tools::TranscriptUtils->set_start_codon( $t );
	    $t = Bio::EnsEMBL::Pipeline::Tools::TranscriptUtils->set_stop_codon( $t );
	};
	if($@){
	    print STDERR "there was a problem with the trancript: [$@]\n";
	    $t->biotype('error');
	    next;
	}
	clean_supporting_features($t);
	my $cds_complete = complete_cds_length($t);
	if (!$cds_complete) {
	    $t->biotype('protein_coding_partial');
	}

	my $pro = $t->translate;
	my $pro_len = translation_length($t);
	print_transcript($t);
	print STDERR " len: $pro_len, min: $min_translation, ", $t->description, "\n";

	# remove low quality EST only transcripts
	if (!$pro || @{$t->get_all_Exons} == 1
	    && @{$t->get_all_supporting_features} == 1
	    && ($t->description =~ /^est$|^est;|^est\s+/ 
		&& ($t->length < 500 || $pro_len < 100)
		&& $t->get_all_supporting_features->[0]->percent_id<99
		|| !is_protein_support($t)
		&& $pro_len < $min_translation
		&& $t->get_all_supporting_features->[0]->percent_id<80
		)
	    ) {
	    $t->biotype('false_transcript'); 
	    print STDERR $t->description, ", remove EST\n\n"; next;
	}

	# translation too short
	if ($pro_len < 25) {
	    $t->biotype('untranslated'); next; 
	}

	my $has_start = 0;
	my $pro_mlen = 0;
	my $mpos = index($pro->seq, 'M');
	if ($mpos == 0) { $has_start = 1; }
	if ($mpos >= 0) {$pro_mlen = $pro_len - $mpos;}

	# cds without ATG but not partial
	my $nonM_start = 0;
	my $coding_start = $t->cdna_coding_start();
	if (!$has_start && $coding_start > 3) {$nonM_start = 1;}
	if ($nonM_start) {
	    print STDERR $pro->seq, "\n", " $coding_start, pseudo cds\n\n"; 
	    $t->biotype('untranslated'); next;
	}

	# too many UTR exons
	if (count_utr_exons($t) > $max_utr_exons
	    && $pro_len*3 < $t->length*$min_cds_ratio
	    && $pro_len*3+$max_utr_len < $t->length) {
	    $t->biotype('unprocessed'); next;
	}

	# short translations
	if ($pro_len < $min_translation 
	    && ($t->description !~ /predict/
		|| @{$t->get_all_Exons} == 1
		&& @{$t->get_all_supporting_features} == 1
		&& $t->get_all_supporting_features->[0]->percent_id<80)
	    || $pro_len < 100
	    && ($t->description =~ /^est$|^est;|\s+est\s+/ 
		|| $t->description =~ /(^|\s)(\w+est|)*\w+est($|;|\s)/)
	    && check_strand_by_intron($t) < 1
	    ) {
	    print STDERR "-- trans len: ", $t->length, " protein len: $pro_len:$pro_mlen\n\n";
	    if (complete_cds_length($t) || $pro_len < $min_translation) {
		$t->biotype('protein_coding_short'); next;
	    }
	}
#	print_transcript($t);
#	print STDERR $pro->seq, "\n";

	print STDERR "trans length: ", $t->length, " protein len: $pro_len:$pro_mlen, start pos: $coding_start\n"; 
	print STDERR $t->biotype, ' ', $t->description, "\n\n";
    }

    my @good_trans;
    foreach my $t (@$transcripts) {
	if ($t->biotype eq 'false_transcript'||$t->biotype eq 'error') {next;}
	if ($complete_cds && ($t->biotype ne 'protein_coding'
			      || length($t->translate->seq) < $min_translation)
	    ) { next; }
	push @good_trans, $t;
    }
    print STDERR scalar(@good_trans), " transcripts afterwards\n";
    return \@good_trans;
}

sub count_utr_exons {
    my ($tran) = @_;
    my $pro = $tran->translate;
    if (!$pro) {
	$tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $tran );
    }
    my $coding_region_start = $tran->coding_region_start||1000000000;
    my $coding_region_end = $tran->coding_region_end||-1000000000;
    my @exons = sort { $a->start <=> $b->start } @{$tran->get_all_Exons};
    my $num = 0;
    foreach my $e (@exons) {
	if ($e->end < $coding_region_start) { $num++; }
	elsif ($e->start > $coding_region_end) { $num++; }
    }
    return $num;
}

sub has_translation_start {
    my ($tran) = @_;
    my $pro = $tran->translate;
    if (!$pro) {
	$tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $tran );
    }
    $pro = $tran->translate;
    if (!$pro) {return 0;}
    my $mpos = index($pro->seq, 'M');
    if ($mpos == 0) { return length($pro->seq); }
    return 0;
}

sub complete_cds_length {
    my ($tran) = @_;
    my $pro = $tran->translate;
    if (!$pro) {
	$tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $tran );
    }
    $pro = $tran->translate;
    if (!$pro) {return 0;}
    my $has_start = 0;
#    print STDERR $pro->seq, "\n";
    if (index($pro->seq, 'M') == 0) { $has_start = 1; }
    my $has_end = 0;
    my $coding_end = $tran->cdna_coding_end();
    my $seq = $tran->translateable_seq;
#    print STDERR $seq, "\n";
#    print STDERR $tran->length, ': ', length($tran->five_prime_utr->seq), " -- $coding_end\n";
    if ($seq =~ /(TAG|TGA|TAA)$/i || $tran->length - $coding_end > 2) {
	$has_end = 1;
    }
#    my $utr3 = $tran->three_prime_utr;
#    if ($utr3 && length($utr3->seq) >= 3) {$has_end = 1;}
    print STDERR $tran->length, "-- CDS start/end: ", $tran->cdna_coding_start(), 
    '/', $coding_end, " has start/end: $has_start/$has_end\n";
    if ($has_start && $has_end) { return length($pro->seq); }
    return 0;
}

sub is_pseudo_cds {
    my ($tran) = @_;
    my $pro = $tran->translate;
    if (!$pro) {
	$tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
	    ->compute_translation( $tran );
    }
    $pro = $tran->translate;
    if (!$pro) {return 1;}
    my $mpos = index($pro->seq, 'M');
    if ($mpos == 0) { return 0; }
    if (length($pro->seq) - $mpos > 50) { return 0; }
    my $coding_start = $tran->cdna_coding_start();
    if ($coding_start <= 3) {return 0;}
    return 1;
}

sub translation_length {
    my ($tran, $is_m) = @_;
    my $pro = $tran->translate;
    if (!$pro) {return 0;}
    my $pro_len = length($pro->seq);
    if (!$is_m) {return $pro_len;}
    my $pro_mlen = 0;
    my $mpos = index($pro->seq, 'M');
    if ($mpos >= 0) {$pro_mlen = $pro_len - $mpos;}
    return $pro_mlen;
}

sub protein_length {
    my ($pro, $pro_len) = @_;
    $pro_len ||= length($pro);
    my $pro_mlen = 0;
    my $mpos = index($pro, 'M');
    if ($mpos >= 0) {$pro_mlen = $pro_len - $mpos;}
    return $pro_mlen;
}

sub extend_transcripts {
    my ($transcripts, $max_end_ext, $high_keep_id, $low_keep_id, $compatible) = @_;
    my @trans = #$transcripts->[0]->slice->name =~ /:1$/ ? 
	sort {$a->start <=> $b->start} @$transcripts;
#	: sort {$b->start <=> $a->start} @$transcripts;
    print STDERR "=== extend transcripts: \n";
 #   print_transcripts(\@trans);
    foreach my $i (1..$#trans) {
	if ($trans[$i]->description eq 'predict') { next; }
	my @sup_feats1 = sort {$b->percent_id <=> $a->percent_id}
	@{$trans[$i]->get_all_supporting_features()||[]};
	foreach my $j (0..$i-1) {
#	    if ($sup_feats1[0] && $sup_feats1[0]->percent_id >= 99.9 && $max_end_ext > 3) {$max_end_ext = 3;}
	    my $new_tran = extend_transcript($trans[$j], $trans[$i], $max_end_ext, $compatible, 1);
	    if ($new_tran->start != $trans[$i]->start) {
		my $len1 = complete_cds_length($trans[$i]);
		my $len = complete_cds_length($new_tran);
		if ($len > $len1 || $len1 < 100) { 
		    $trans[$i] = $new_tran; 
		    last;
		}
	    }
	}
    }
#    print_transcripts(\@trans);
    @trans = #$trans[0]->slice->name =~ /-1$/ ?
	sort {$a->end <=> $b->end} @trans;
#	: sort {$b->end <=> $a->end} @trans; 
#    print_transcripts(\@trans);
    foreach my $i (1..$#trans) {
	if ($trans[$i]->description eq 'predict') { next; }
	my @sup_feats1 = sort {$b->percent_id <=> $a->percent_id}
	@{$trans[$i]->get_all_supporting_features()||[]};
	foreach my $j (0..$i-1) {
#	    if ($sup_feats1[0] && $sup_feats1[0]->percent_id >= 99.9 && $max_end_ext > 3) {$max_end_ext = 3;}
	    my $new_tran = extend_transcript($trans[$i], $trans[$j], $max_end_ext, $compatible);
	    if ($new_tran->end != $trans[$i]->end) {
		my $len1 = complete_cds_length($trans[$i]);
		my $len = complete_cds_length($new_tran);
		if ($len > $len1 || $len1 < 100) { 
		    $trans[$i] = $new_tran; 
		    last;
		}
	    }
	}
    }
#    print_transcripts(\@trans);
    return remove_redundant_transcripts(\@trans, $high_keep_id, $low_keep_id, $max_end_ext, 1);
}

# extending transcripts on one end (left or right)
sub extend_transcript {
    my ($ltran, $rtran, $max_end_ext, $compatible, $onleft) = @_;
    
#    print_exons($ltran);
#    print_exons($rtran);
#    print "\n";
    if ($onleft && 
	($rtran->start <= $ltran->start	|| !is_overlapping($ltran, $rtran))) {
	return $rtran;
    }elsif (!$onleft && 
	    ($ltran->end >= $rtran->end || !is_overlapping($ltran, $rtran))) {
	return $ltran;
    }

    my @lexons = sort {$a->start <=> $b->start} @{$ltran->get_all_Exons()};
    my @rexons = sort {$a->start <=> $b->start} @{$rtran->get_all_Exons()};
    
#    if ($onleft) {
#	my ($st) = compare_exons_approxy(\@rexons, \@lexons, $max_end_ext);
#	print STDERR "status: $st\n";
#	if ($st == 1 || $st == 2) { return $ltran; }
#    }else {
#	my ($st) = compare_exons_approxy(\@lexons, \@rexons, $max_end_ext);
#	print STDERR "status: $st\n";
#	if ($st == 1 || $st == 2) { return $rtran; }
#    }

    my $i = 0;
    my ($l, $r) = (-1, -1);
    my $enum = $onleft ? @lexons : @rexons;
    for (; $i < $enum; $i++) {
	my ($ll, $rr) = $onleft ? ($i, 0) : ($#lexons, $i);
	if (is_overlapping($lexons[$ll], $rexons[$rr])) {
	    ($l, $r) = ($ll, $rr);
	    last;
	}
    }
#    print STDERR $onleft || 0, ": $l, $r\n";
    if ($l < 0 || $r < 0) {
	if ($onleft) { return $rtran; }
	else { return $ltran; }
    }
    @lexons = @lexons[0..$l];
    @rexons = @rexons[$r..$#rexons];
    my ($status, $exoncomps) = compare_exons_approxy(\@lexons, \@rexons, $max_end_ext);
#    print STDERR "status: $status\n";
    if ($compatible && $status != 2 && $status != 3 && $status != 5) {
	if ($onleft) {return $rtran;}
	else {return $ltran;}
    }
    
#    print STDERR "combine exons: ", $lexons[$#lexons]->start, '-',$lexons[$#lexons]->end, ' ',
#    $rexons[0]->start, '-', $rexons[0]->end, "\n";
    my $exon = $lexons[$#lexons]->strand == 1 ?
	$lexons[$#lexons]->adjust_start_end(0, $rexons[0]->end - $lexons[$#lexons]->end)
	: $lexons[$#lexons]->adjust_start_end($lexons[$#lexons]->end - $rexons[0]->end, 0);
#    print "new exon: ", $exon->start, '-', $exon->end, "\n";
    $lexons[$#lexons] = $exon;
    for my $j (1..$#rexons) {
	$rexons[$j]->slice($exon->slice);
	push @lexons, $rexons[$j];
    }
    #need to setup all exons before create transcript, otherwise a bug causing transcript end not set correctly
    @lexons = $lexons[0]->strand==1 ? sort {$a->start<=>$b->start} @lexons : sort {$b->start<=>$a->start} @lexons;
#    @lexons = sort {$lexons[0]->strand==1 ? $a->start<=>$b->start : $b->start<=>$a->start} @lexons;

    my $newtran = new Bio::EnsEMBL::Transcript(-EXONS => \@lexons);
    $newtran->dbID(0);
    $newtran->description(create_description($rtran, $ltran, 1));
    $newtran->add_supporting_features(@{$rtran->get_all_supporting_features()});
    $newtran->add_supporting_features(@{$ltran->get_all_supporting_features()});

#    print STDERR "--after combine:\n";print_transcript($newtran);

    return $newtran;
}


# remove transcripts
# 1) identical ones: keep one with best supporting evidence
# 2) full coverage ones: basically keep with best supporting evidence
sub remove_redundant_transcripts {
    my ($transcripts, $high_threshold, $low_threshold, $max_end_ext, $remove_predict) = @_;
    my @trans = sort {$a->length <=> $b->length} @$transcripts;
    print STDERR "---$high_threshold, $low_threshold, remove redundancy: ";
    print_transcripts(\@trans);

    my @trans_new;
    for my $i (0..$#trans-1) {
	my @sup_feats1 = sort {$b->percent_id <=> $a->percent_id}
	@{$trans[$i]->get_all_supporting_features()||[]};
#	print STDERR $trans[$i]->dbID||'new', ' ', $sup_feats1[0]->score, ':', $sup_feats1[0]->percent_id, "\n";
#	if ($sup_feats1[0]->score + $sup_feats1[0]->percent_id < 150) {next;}
	my $keep = 1;
	for my $j ($i+1..$#trans) {
	    my @sup_feats2 = sort {$b->percent_id <=> $a->percent_id}
	    @{$trans[$j]->get_all_supporting_features()||[]};
#	    print STDERR $trans[$j]->dbID||'new', ' ', $sup_feats2[0]->score, ':', $sup_feats2[0]->percent_id, "\n";
	    my ($value) = compare_transcripts_approxy($trans[$i], $trans[$j], $max_end_ext);
	    if ($value == 0) {next;}
	    print STDERR $trans[$i]->dbID||'new', ' ', $trans[$j]->dbID||'new', " comp value $value\n";
#	    print_transcripts([$trans[$i], $trans[$j]]);
#	    print STDERR $sup_feats1[0]->percent_id, ' ', $sup_feats2[0]->percent_id, "\n";

            # overlapping transcripts from the same evidence keep only one
#           if ($value != 0 && @sup_feats1 == 1 && @sup_feats2 == 1
#               && $sup_feats1[0]->hseqname eq $sup_feats2[0]->hseqname) {
#               if ($sup_feats1[0]->percent_id > $sup_feats2[0]->percent_id){
#                   for (my $k=$j; $k > $i; $k--) { # keep $i
#                       $trans[$k] = $trans[$k-1]; # shift all between
#                   }
#               }
#               $keep = 0;
#               last;
#           }
 	    if ($value == 1) { # same transcripts
		my $description = create_description($trans[$i], $trans[$j]);
		if ($sup_feats1[0] && (!$sup_feats2[0] || $sup_feats1[0]->percent_id > $sup_feats2[0]->percent_id)) {
		    if ($sup_feats2[0] && $sup_feats1[0]->hseqname ne $sup_feats2[0]->hseqname && 
			($sup_feats2[0]->percent_id >= $low_threshold
			 && @{$trans[$j]->get_all_Exons} > 1
			 || $trans[$j]->description ne 'est' 
			 && $trans[$j]->description ne 'predict' 
			 )
			) {
			$trans[$i]->add_supporting_features(@sup_feats2);
		    }
		    $trans[$j] = $trans[$i]; # keep better one
		}elsif ($sup_feats1[0] && $sup_feats1[0]->hseqname ne $sup_feats2[0]->hseqname &&
			($sup_feats1[0]->percent_id >= $low_threshold
			 && @{$trans[$i]->get_all_Exons} > 1
			 || $trans[$i]->description ne 'est' 
			 && $trans[$i]->description ne 'predict' 
			 )
			) {
		    $trans[$j]->add_supporting_features(@sup_feats1);
		}
		$trans[$j]->description($description);
		$keep = 0;
		last;
	    }
	    if ($value == 2) { # some can be longer genes but with low identity 
		my $description = create_description($trans[$i], $trans[$j], 1);
		if ($sup_feats1[0] && $sup_feats1[0]->percent_id >= $high_threshold 
		    && (!$sup_feats2[0] || $sup_feats2[0]->percent_id < $low_threshold)
		    && complete_cds_length($trans[$i])
		    && length($trans[$i]->translate->seq) > 200
		    ){
		    print STDERR "---remove1 ", $trans[$j]->stable_id||$trans[$j]->dbID, "\n";
		    for (my $k=$j; $k > $i; $k--) { # keep $i
			$trans[$k] = $trans[$k-1]; # shift all between
		    }
		}elsif ($sup_feats1[0] && (!$sup_feats2[0] || $sup_feats1[0]->hseqname ne $sup_feats2[0]->hseqname) 
			&& ($trans[$j]->description =~ /predict/
			    || @{$trans[$i]->get_all_Exons} > 1
			    && $sup_feats1[0]->percent_id >= $low_threshold
			    || $trans[$i]->description ne 'est'
			    )
			) {
		    $trans[$j]->add_supporting_features(@sup_feats1);
		    print "---remove2 ", $trans[$i]->stable_id||$trans[$i]->dbID, "\n";
		}
		$trans[$j]->description($description);
		$keep = 0; 
		last;
	    }
	    if ($value == 4) { # overlapping transcripts keep best one if any
		print STDERR "---overlapping, ", $trans[$i]->description, ', ', $trans[$j]->description, "\n";
		my $removej = 0;
		if ($sup_feats1[0] && $sup_feats1[0]->percent_id >= $high_threshold 
		    && (!$sup_feats2[0] || $sup_feats2[0] && $sup_feats2[0]->percent_id < $low_threshold)
		    && (has_translation_start($trans[$i]) && $trans[$i]->description !~ /predict/i)
		    ) {
		    $removej = 1;
		}elsif ($sup_feats2[0] && $sup_feats2[0]->percent_id >= $high_threshold 
			&& (!$sup_feats1[0] || $sup_feats1[0] && $sup_feats1[0]->percent_id < $low_threshold)
			&& (has_translation_start($trans[$j]) && $trans[$j]->description !~ /predict/)
			) { 
		    $keep = 0; # remove $i
		}else {
		    my $in1 = check_strand_by_intron($trans[$i]); 
		    my $in2 = check_strand_by_intron($trans[$j]);
		    if (!$trans[$i]->translate) {
			$trans[$i] = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
			    ->compute_translation( $trans[$i] );
		    }
		    if (!$trans[$j]->translate) {
			$trans[$j] = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
			    ->compute_translation( $trans[$j] );
		    }
		    my $len1 = length($trans[$i]->translate->seq) if $trans[$i]->translate;
		    my $len2 = length($trans[$j]->translate->seq) if $trans[$j]->translate;
		    print STDERR "   intron value: $in1, $in2\n";
		    if ((!$sup_feats1[0] || $sup_feats1[0]->percent_id < $low_threshold)
			&& (!$sup_feats2[0] || $sup_feats2[0]->percent_id < $low_threshold)) {
			if ($in1 > $in2 && $trans[$i]->length > $trans[$j]->length) {
			    $removej = 1;
			}elsif ($in2 > $in1 && $trans[$j]->length > $trans[$i]->length) {
			    $keep = 0;
			}elsif ($len1 > $len2) {
			    $removej = 1;
			}elsif ($len2 > $len1) {
			    $keep = 0;
			}elsif ($in1 > $in2) { $removej = 1;}
			elsif ($in1 < $in2) { $keep = 0;}
			elsif ($trans[$i]->length > $trans[$j]->length) { $removej = 1; }
			elsif ($trans[$j]->length > $trans[$i]->length) { $keep = 0; }
		    }elsif (!$sup_feats1[0] || $sup_feats1[0]->percent_id < $low_threshold) {
			if ($in1 <= $in2 || $trans[$j]->length >= $trans[$i]->length) { $keep = 0; }
		    }elsif (!$sup_feats2[0] || $sup_feats2[0]->percent_id < $low_threshold) {
			if ($in2 <= $in1 || $trans[$i]->length >= $trans[$j]->length) { $removej = 1;}
		    }
		}
		if (!$keep) {
		    print STDERR "---remove3 ", $trans[$i]->stable_id||$trans[$i]->dbID, "\n";
		    last;
		}
		if ($removej) {
		    for (my $k=$j; $k > $i; $k--) { # keep $i
			$trans[$k] = $trans[$k-1]; # shift all between
		    }
		    $keep = 0;
		    print STDERR "---remove4 ", $trans[$j]->stable_id||$trans[$j]->dbID, "\n";
		    last;
		}
	    }
	}
	if ($keep) { 
	    push @trans_new, $trans[$i]; 
#	    print STDERR "  keep1 ", $trans[$i]->dbID||'new', "\n";
#	    print STDERR ' ', scalar @{$trans[$i]->get_all_supporting_features()||[]}, "\n";
	}
    }
    push @trans_new, $trans[$#trans] if $trans[$#trans];
#    print STDERR "  keep2 ", $trans[$#trans]->dbID||'new', "\n";
    return \@trans_new;
}

# combine two compatible overlapping transcripts
# ie, if there are no alternative exons between them
# this method is very slow in some cases (bug?), ie, too many combinations
sub combine_transcripts1 { 
    my ($transcripts, $evitype, $high_threshold, $low_threshold, $max_boundary_shift, $max_end_ext) = @_;
#    print scalar @$transcripts, " high: $high_threshold, low: $low_threshold\n";
    my @transcripts = sort {$a->length <=> $b->length} @$transcripts;
    my @trans_new;
    my @keep = (1) x scalar @transcripts;
    for my $i (0..$#transcripts-1) {
#	print "==$i: ", $keep[$i], "\n";
	my $tran = $transcripts[$i];
#	my $keep = 1;
	for my $j ($i+1..$#transcripts) {
#	    print $transcripts[$i]->length, ' ', $transcripts[$j]->length, "\n";
	    # need to resort
	    my ($trans1, $trans2) = $transcripts[$i]->length < $transcripts[$j]->length ? 
		($transcripts[$i], $transcripts[$j]) : ($transcripts[$j], $transcripts[$i]);

	    my  ($value, $exoncomp) = compare_transcripts($trans1, $trans2);
#	    print STDERR "--comp value $value\n";
	    my @sup_feats1 = sort {$b->percent_id <=> $a->percent_id}
		@{$trans1->get_all_supporting_features()||[]};
	    my @sup_feats2 = sort {$b->percent_id <=> $a->percent_id}
		@{$trans2->get_all_supporting_features()||[]};
#	    print $sup_feats1[0]->percent_id, ' ', $sup_feats2[0]->percent_id, "\n";
	    if ($value == 3 || $value == 5) { # compatible overlapping transcripts
		if ($sup_feats1[0]->percent_id>$high_threshold && $sup_feats2[0]->percent_id < $low_threshold) {
#		    $transcripts[$j] = $trans1; 
		    if ($trans1 eq $tran) { $keep[$j] = 0;}
		    else {$keep[$i] = 0;}
		}elsif ($sup_feats2[0]->percent_id>$high_threshold && $sup_feats1[0]->percent_id < $low_threshold) {
#		    $transcripts[$j] = $trans2;
		    if ($trans2 eq $tran) { $keep[$j] = 0;}
		    else {$keep[$i] = 0;}
		}else {
		    my $new_tran = combine_pair_transcripts($transcripts[$i], $transcripts[$j], $evitype);
#		    $transcripts[$j] = $new_tran;
		    push @trans_new, $new_tran;
		    $keep[$j] = 0;
		    $keep[$i] = 0;
		}
	    }
	}
#	print_transcript($tran);
#	print $keep[$i], "\n";
	if ($keep[$i]) {push @trans_new, $tran;}
    }
    if ($keep[$#transcripts]) {push @trans_new, $transcripts[$#transcripts];}
    return \@trans_new;
}

sub combine_transcripts {
    my ($transcripts, $evitype, $high_threshold, $low_threshold, $max_end_ext) = @_;
    my @transcripts = sort {$a->length <=> $b->length} @$transcripts;
    my @trans_new;
    for my $i (0..$#transcripts-1) {
	my $keep = 1;
	for my $j ($i+1..$#transcripts) {
#	    print $transcripts[$i]->length, ' ', $transcripts[$j]->length, "\n";
	    # need to resort
	    my ($trans1, $trans2) = $transcripts[$i]->length < $transcripts[$j]->length ? 
		($transcripts[$i], $transcripts[$j]) : ($transcripts[$j], $transcripts[$i]);
	    
	    my  ($value, $exoncomp) = compare_transcripts_approxy($trans1, $trans2, $max_end_ext);
#	    print STDERR "--comp value $value\n";
	    my @sup_feats1 = sort {$b->percent_id <=> $a->percent_id}
		@{$trans1->get_all_supporting_features()||[]};
	    my @sup_feats2 = sort {$b->percent_id <=> $a->percent_id}
		@{$trans2->get_all_supporting_features()||[]};
	    if ($value == 3 || $value == 5) { # compatible overlapping transcripts
		if ($sup_feats1[0]->percent_id>$high_threshold 
		    && $sup_feats2[0]->percent_id < $low_threshold
		    && (complete_cds_length($trans1)
			&& length($trans1->translate->seq)>200)
		    ) {
		    $transcripts[$j] = $trans1; 
		    $keep = 0;
		    last;
		}elsif ($sup_feats2[0]->percent_id>$high_threshold 
			&& $sup_feats1[0]->percent_id < $low_threshold
			&& (complete_cds_length($trans2)
			    && length($trans2->translate->seq)>200)
			) {
		    $transcripts[$j] = $trans2;
		    $keep = 0;
		    last;
		}else {
		    my $new_tran = combine_pair_transcripts($transcripts[$i], $transcripts[$j], $evitype);
		    my $ox = 0;
		    foreach my $e (@$exoncomp) {
			if ($e > 0 && $e < 4) { $ox ++; }
		    }
		    if ($ox > 0) { 
			$transcripts[$j] = $new_tran; 
			$keep = 0; 
			last;
		    }
		    my $len1 = complete_cds_length($transcripts[$i]);
		    my $len2 = complete_cds_length($transcripts[$j]);
		    my $len = complete_cds_length($new_tran);
		    print STDERR "=== check combined transcript === $len, $len1, $len2\n";
		    if ($len >= $len1+$len2 || $len1<100 || $len2 < 100) {
			$transcripts[$j] = $new_tran;
			$keep = 0;
			last;
		    }
		}
	    }
	}
	if ($keep) {push @trans_new, $transcripts[$i];}
    }
    push @trans_new, $transcripts[$#transcripts];
    return \@trans_new;
}

sub combine_pair_transcripts {
    my ($qtrans, $ttrans, $evitype, $exoncomps) = @_;

    #$ttrans = $ttrans->transfer($qtrans->slice);
    my ($ltrans, $rtrans) = $qtrans->start <= $ttrans->start ? ($qtrans, $ttrans) : ($ttrans, $qtrans);
#    print STDERR "--ltrans ", $ltrans->dbID, ': ', $ltrans->start, '-', $ltrans->end, "\n"; print_exons($ltrans);
#    print STDERR "--rtrans ", $rtrans->dbID, ': ', $rtrans->start, '-', $rtrans->end, "\n"; print_exons($rtrans);
#    print_transcript($ltrans);
#    print_transcript($rtrans);
    if ($rtrans->end <= $ltrans->end) {return $ltrans;}
    my @lexons = sort {$a->start <=> $b->start} @{$ltrans->get_all_Exons()};
    my @rexons = sort {$a->start <=> $b->start} @{$rtrans->get_all_Exons()};

    my $i = 0;
    for (; $i < @rexons; $i++) {
	if ($rexons[$i]->end > $lexons[$#lexons]->start
	    #&& $rexons[$i]->start <= $lexons[$#lexons]->end
	    ) {last;}
    }
    print STDERR "combine exons: ",$ltrans->strand,' ',$lexons[$#lexons]->start, '-',
    $lexons[$#lexons]->end, ' ', $rexons[$i]->start, '-', $rexons[$i]->end, "\n";
    my $exon = $lexons[$#lexons]->strand == 1 ?
	$lexons[$#lexons]->adjust_start_end(0, $rexons[$i]->end - $lexons[$#lexons]->end)
	: $lexons[$#lexons]->adjust_start_end($lexons[$#lexons]->end - $rexons[$i]->end, 0);
    print STDERR "new exon: ", $exon->start, '-', $exon->end, "\n";
    if (@rexons >= 1) {
#	print "supports1: ", scalar @{$exon->get_all_supporting_features}, "\n";
	$exon->add_supporting_features(@{$rexons[$i]->get_all_supporting_features});
#	print "supports2: ", scalar @{$exon->get_all_supporting_features}, "\n";
    }
    
    $lexons[$#lexons] = $exon;
    for my $j ($i+1..$#rexons) {
	$rexons[$j]->slice($exon->slice);
	push @lexons, $rexons[$j];
    }
#    print $lexons[0]->strand, "\n";exit;
    #exon order need change for minus strand
    @lexons = $lexons[0]->strand==1 ? sort {$a->start<=>$b->start} @lexons : sort {$b->start<=>$a->start} @lexons;
    #need to setup all exons before create transcript, otherwise a bug(?) causing transcript end not set correctly

    my $newtrans = new Bio::EnsEMBL::Transcript(-EXONS => \@lexons);
#    $newtrans->slice($ltrans->slice); # need change slice for supporting features
    $newtrans->description(create_description($ltrans, $rtrans, 1));
    $newtrans->dbID(0);

    # use supporting features which support
    my @sup_feats = (@{$ltrans->get_all_supporting_features()}, @{$rtrans->get_all_supporting_features()});
#    if (@rexons > 1) {
#        if (@{$ltrans->get_all_Exons} > 1) {
#            push @sup_feats, @{$rtrans->get_all_supporting_features()};
#        }else {
#            @sup_feats = @{$rtrans->get_all_supporting_features()};
#        }
#    }elsif (@{$ltrans->get_all_Exons} == 1) {
#	push @sup_feats, @{$rtrans->get_all_supporting_features()};
#    }
    
#    print_transcript($newtrans);print scalar @sup_feats, "\n";
    $newtrans->add_supporting_features(@sup_feats);
#    print_transcript($newtrans);
#    print STDERR "--after combine:\n";print_exons($newtrans);print $newtrans->description,"\n";

    return $newtrans;
}

sub create_description {
    my ($tran1, $tran2, $use_predict) = @_;
    
    if (!$use_predict && $tran1->description eq 'predict'){return $tran2->description;}
    if (!$use_predict && $tran2->description eq 'predict'){return $tran1->description;}
    my @lbs = split /\|/, $tran1->description;
    my @rbs = split /\|/, $tran2->description;
    my %description = ();
    foreach my $b (@lbs, @rbs) {
	$description{$b}=1;
    }
    my $description;
    foreach my $b (sort keys %description) {
	$description ? ($description .= '|'.$b) : ($description = $b);
    }
    return $description;
}

sub clean_supporting_features() {
    my ($tran) = @_;
    my %name;
    if (@{$tran->get_all_supporting_features||[]} < 30) {return;}
    my @sfs = sort {$b->percent_id <=> $a->percent_id}
    @{$tran->get_all_supporting_features()||[]};
    $tran->flush_supporting_features();
    my $high_perc = $sfs[0] ? $sfs[0]->percent_id : 0;
    my $pro = 0;
    my $low = 0;
    foreach my $sf (@sfs) {
	if (!$name{$sf->hseqname}) {
	    if ($low < 10 || $high_perc - $sf->percent_id < 2) {
		$name{$sf->hseqname} = $sf;
		if ($sf->percent_id <= 98) { $low++; }
	    }elsif ($pro < 5 && ! $sf->isa('Bio::EnsEMBL::DnaDnaAlignFeature')) {
		$name{$sf->hseqname} = $sf;
		$pro++;
	    }	    
	}
    }
    $tran->add_supporting_features(values %name);
}

# compare two transcripts if exon/intron boundaries same
# type:
#       0: no overlapping exons, 
#       1: same transcript 
#       2: query covered by target, but with extended end exon(s)
#       3: query covered by target, but with shortened end exon(s)
#       4: overlapping with different middle exons or missing exons
#       5: partial overlap with missing end exons (middle same if any)
######################################################################
# first transcript should be the shorter one
sub compare_transcripts {
    my ($qtrans, $ttrans, $ignore_strand) = @_;
#    print STDERR "==qtrans "; print_transcript($qtrans);
#    print STDERR "==ttrans "; print_transcript($ttrans);
    if (!$ignore_strand && $qtrans->strand != $ttrans->strand) {return (0);}
    if ($qtrans->end < $ttrans->start || $qtrans->start > $ttrans->end) {return (0);}

    my @qexons = sort {$a->start <=> $b->start} @{$qtrans->get_all_Exons()};
    my @texons = sort {$a->start <=> $b->start} @{$ttrans->get_all_Exons()};

    my ($status1, $exoncomps) = compare_exons(\@qexons, \@texons);
    my ($status2) = compare_exons(\@texons, \@qexons);
#    print STDERR "status: $status1, $status2\n";
    if ($status2 == 4) {
	$status1 = 4;
    }elsif ($status1 == 3 && $status2 == 5) {
	$status1 = 5;
    }
#    print STDERR "  -- final: $status1 \n";
    return ($status1, $exoncomps);
}

sub compare_transcripts_approxy {
    my ($qtrans, $ttrans, $max_end_ext, $ignore_strand) = @_;
    $max_end_ext ||= 0;
#    print STDERR "qtrans ", $qtrans->dbID, "\n"; print_exons($qtrans);
#    print STDERR "ttrans ", $ttrans->dbID, "\n"; print_exons($ttrans);
    if (!$ignore_strand && $qtrans->strand != $ttrans->strand) {return (0);}
    if ($qtrans->end < $ttrans->start || $qtrans->start > $ttrans->end) {return (0);}

    my @qexons = sort {$a->start <=> $b->start} @{$qtrans->get_all_Exons()};
    my @texons = sort {$a->start <=> $b->start} @{$ttrans->get_all_Exons()};

    my ($status1, $exoncomps) = compare_exons_approxy(\@qexons, \@texons, $max_end_ext);
    my ($status2) = compare_exons_approxy(\@texons, \@qexons, $max_end_ext);
    print STDERR "status: $status1, $status2\n";
    if ($status2 == 4) {
	$status1 = 4;
    }elsif ($status1 == 3 && $status2 == 5) {
	$status1 = 5;
    }
#    print STDERR "  -- final: $status1 \n";
    return ($status1, $exoncomps);
}

# exons must be sorted from left to right
# compare two transcripts if exon/intron boundaries same
# type:
#       0: non overlapping exons, 
#       1: same exons
#       2: query covered by target, but extended
#       3: query covered by target, but shortened
#       4: overlapping incompatible middle exons
#       5: partial overlapping compatible end exons
sub compare_exons {
    my ($qexons, $texons) = @_;

    my @qexons = @$qexons;
    my @texons = @$texons;
    my @exoncomps = (0) x @qexons;
    if ($qexons[0]->start > $texons[$#texons]->start 
	&& $qexons[0]->end > $texons[$#texons]->end) {
	$exoncomps[0] = 5;
	return (5, \@exoncomps);
    }elsif ($texons[0]->start > $qexons[$#qexons]->start 
	&& $texons[0]->end > $qexons[$#qexons]->end) {
	$exoncomps[$#qexons] = 5;
	return (5, \@exoncomps);
    }
    for (my ($i, $j)=(0,0); $i <@qexons; $i++) {
	for (; $j < @texons; $j++) {
	    if ($texons[$j]->end < $qexons[$i]->start) {next;}
	    if ($texons[$j]->start > $qexons[$i]->end) {last;}
	    if ($qexons[$i]->start == $texons[$j]->start 
		&& $qexons[$i]->end == $texons[$j]->end) { # equal exons
		$exoncomps[$i] = 1;
	    }elsif (@qexons==1 && $qexons[0]->start>=$texons[$j]->start 
		    && $qexons[0]->end<=$texons[$j]->end 
		    || $i==0 && $qexons[$i]->end == $texons[$j]->end 
		    && $qexons[$i]->start > $texons[$j]->start
		    || $i==$#qexons && $qexons[$i]->start == $texons[$j]->start 
		    && $qexons[$i]->end < $texons[$j]->end) {
		$exoncomps[$i] = 2; # extended end exons treated differently
	    }elsif (@texons==1 && $texons[0]->start>=$qexons[$i]->start 
		    && $texons[0]->end<=$qexons[$i]->end
		    || $j==0 && $qexons[$i]->end == $texons[$j]->end 
		    && $qexons[$i]->start < $texons[$j]->start
		    || $j==$#texons && $qexons[$i]->start == $texons[$j]->start 
		    && $qexons[$i]->end > $texons[$j]->end) {
		$exoncomps[$i] = 3; # shortened end exons treated differently
	    }else {$exoncomps[$i] = 4;} # alternative exons
	    last; # only compare one overlapping exon if there are more
	}
#	print STDERR "$i: ", $exoncomps[$i], "\n";
    }

    my $status = 0;
    for (my $i = 0; $i < @exoncomps; $i++) {
	if ($exoncomps[$i] == 0 && $qexons[$i]->start <= $texons[$#texons]->end
	    && $qexons[$i]->end >= $texons[0]->start) {
	    $status = 4;
	}elsif ($status < $exoncomps[$i]) {
	    $status = $exoncomps[$i];
	}	
    }
    if (($exoncomps[0] == 0 || $exoncomps[$#exoncomps] == 0) 
	&& $status > 0 && $status < 4) {
	return (5, \@exoncomps);
    }elsif ($status == 1 && @qexons < @texons) { # if both end covered and same 
	return (2, \@exoncomps);	
    }else {
	return ($status, \@exoncomps);
    }
}


sub compare_exons_approxy {
    my ($qexons, $texons, $max_end_ext) = @_;
    if (! $max_end_ext || $max_end_ext < 0) {$max_end_ext=0;}
#    print "max_end_ext: $max_end_ext\n";    
    my @qexons = @$qexons;
    my @texons = @$texons;
    my @exoncomps = (0) x @qexons;
    if (is_overlapping($qexons[0], $texons[$#texons])
	&& (@qexons == 1 && @texons == 1
	    && $qexons[0]->start > $texons[$#texons]->start 
	    && $qexons[0]->end > $texons[$#texons]->end
	    || (@qexons != 1 || @texons != 1)
	    && $qexons[0]->start > $texons[$#texons]->start - $max_end_ext
	    && $qexons[0]->end > $texons[$#texons]->end - $max_end_ext)) {
	$exoncomps[0] = 5;
	return (5, \@exoncomps);
    }elsif (is_overlapping($texons[0], $qexons[$#qexons])
	    && (@qexons == 1 && @texons == 1
		&& $texons[0]->start > $qexons[$#qexons]->start 
		&& $texons[0]->end > $qexons[$#qexons]->end
		|| (@qexons != 1 || @texons != 1)
		&& $texons[0]->start > $qexons[$#qexons]->start - $max_end_ext
		&& $texons[0]->end > $qexons[$#qexons]->end - $max_end_ext)) {
	$exoncomps[$#qexons] = 5;
	return (5, \@exoncomps);
    }
    for (my ($i, $j)=(0,0); $i <@qexons; $i++) {
	for (; $j < @texons; $j++) {
	    if ($texons[$j]->end < $qexons[$i]->start) {next;}
	    if ($texons[$j]->start > $qexons[$i]->end) {last;}
	    if ($qexons[$i]->start == $texons[$j]->start 
		&& $qexons[$i]->end == $texons[$j]->end) { # equal exons
		$exoncomps[$i] = 1;
	    }elsif (@qexons==1 && $qexons[0]->start>=$texons[$j]->start 
		    && $qexons[0]->end<=$texons[$j]->end 
		    || $i==0 && $qexons[$i]->end == $texons[$j]->end 
		    && $qexons[$i]->start > $texons[$j]->start
		    || $i==$#qexons && $qexons[$i]->start == $texons[$j]->start 
		    && $qexons[$i]->end < $texons[$j]->end) {
		$exoncomps[$i] = 2; # extended end exons treated differently
	    }elsif (@texons==1 && $texons[0]->start>=$qexons[$i]->start 
		    && $texons[0]->end<=$qexons[$i]->end
		    || $j==0 && $qexons[$i]->end == $texons[$j]->end 
		    && $qexons[$i]->start < $texons[$j]->start
		    || $j==$#texons && $qexons[$i]->start == $texons[$j]->start 
		    && $qexons[$i]->end > $texons[$j]->end) {
		$exoncomps[$i] = 3; # shortened end exons treated differently
	    }elsif ($max_end_ext != 0) {
		if (@qexons==1 && $qexons[0]->start+$max_end_ext>=$texons[$j]->start 
		    && $qexons[0]->end-$max_end_ext<=$texons[$j]->end 
		    || @texons==1 && $texons[0]->start+$max_end_ext>=$qexons[$i]->start 
		    && $texons[0]->end-$max_end_ext<=$qexons[$i]->end
		    || $i==0 && $qexons[$i]->end == $texons[$j]->end 
		    && $qexons[$i]->start < $texons[$j]->start
		    && $qexons[$i]->start >= $texons[$j]->start - $max_end_ext
		    || $i==$#qexons && $qexons[$i]->start == $texons[$j]->start
		    && $qexons[$i]->end > $texons[$j]->end
		    && $qexons[$i]->end <= $texons[$j]->end + $max_end_ext
		    || $j==0 && $qexons[$i]->end == $texons[$j]->end 
		    && $qexons[$i]->start > $texons[$j]->start
		    && $qexons[$i]->start <= $texons[$j]->start + $max_end_ext
		    || $j==$#texons && $qexons[$i]->start == $texons[$j]->start
		    && $qexons[$i]->end < $texons[$j]->end
		    && $qexons[$i]->end >= $texons[$j]->end - $max_end_ext
		    ) {
		    $exoncomps[$i] = 1;
		}else {$exoncomps[$i] = 4;} # alternative exons
	    }else {$exoncomps[$i] = 4;} # alternative exons
	    last; # only compare one overlapping exon if there are more
	}
#	print STDERR "$i: ", $exoncomps[$i], "\n";
    }

    my $status = 0;
    for (my $i = 0; $i < @exoncomps; $i++) {
	if ($exoncomps[$i] == 0 && $qexons[$i]->start <= $texons[$#texons]->end
	    && $qexons[$i]->end >= $texons[0]->start) {
	    $status = 4;
	}elsif ($status < $exoncomps[$i]) {
	    $status = $exoncomps[$i];
	}	
    }
    if (($exoncomps[0] == 0 || $exoncomps[$#exoncomps] == 0) 
	&& $status > 0 && $status < 4) {
	return (5, \@exoncomps);
    }elsif ($status == 1 && @qexons < @texons) { # if both end covered and same 
	return (2, \@exoncomps);	
    }else {
	return ($status, \@exoncomps);
    }
}

sub connect_transcripts {
    my ($transcripts, $max_distance) = @_;
    print STDERR "distance: $max_distance\n";
    my @trans = sort { $a->start <=> $b->start } @$transcripts;
    my @index;
    my @new_trans;
    for my $i (0..$#trans-1) {
	my $j = $i+1;
	for (; $j < @trans; $j++) {
	    if ($trans[$j]->start < $trans[$i]->end) {next;}
	    if ($trans[$j]->start > $trans[$i]->end+$max_distance) {last;}
	    if ($trans[$i]->strand != $trans[$j]->strand) {
		if (@{$trans[$i]->get_all_Exons} == 1) {
		    $trans[$i] = change_orientation($trans[$i]);
		}elsif (@{$trans[$j]->get_all_Exons} == 1) {
		    $trans[$j] = change_orientation($trans[$j]);
		}else { next; }
	    }
	    print STDERR "===connect transcripts:\n";
	    print_transcript($trans[$i]);
	    print_transcript($trans[$j]);
	    my $new_tran = combine_pair_transcripts($trans[$i],$trans[$j]);
	    if ($new_tran->length > $trans[$i]->length 
		&& $new_tran->length > $trans[$j]->length) {
		$new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		    ->compute_translation( $new_tran );
	    }
	    $new_tran = check_strand_by_translation($new_tran);
	    if (translation_length($new_tran)>translation_length($trans[$i])
		&& translation_length($new_tran)>translation_length($trans[$j])
		|| translation_length($new_tran, 1)>translation_length($trans[$i], 1)
		&& translation_length($new_tran, 1)>translation_length($trans[$j], 1)
		) {
		push @new_trans, $new_tran;
		$index[$i] = 1;
		$index[$j] = 1;
	    }
	}
    }
    foreach my $i (0..$#trans) {
	if (!$index[$i]) {push @new_trans, $trans[$i];}
    }
    return \@new_trans;
}

sub connect_transcripts_by_prediction {
    my ($transcripts, $max_end_ext, $max_check_id, $max_bad_ratio, $max_distance) = @_;
    $max_distance ||= 2000;
    $transcripts = add_translation($transcripts);
    if (@$transcripts == 1) {return $transcripts;}
    my @predicts;
    my @trans;
    my @cdna_trans;
    foreach my $t (@$transcripts) {
	if ($t->description =~ /predict/) { push @predicts, $t; }
	elsif ($t->description !~ /^cdna/) { push @trans, $t; }
	else { push @cdna_trans, $t; }
    }
    if (@predicts == 0) {return $transcripts;}
    @trans = sort { $a->start <=> $b->start } @trans;

    my @trans_pairs;
    for (my $i=0; $i < $#trans; $i++) {
	my $j=$i+1;
	for (; $j < @trans; $j++) {
	    if ($trans[$j]->start < $trans[$i]->end) {next;}
	    if ($trans[$j]->start - $trans[$i]->end > $max_distance) {last;}
	    push @trans_pairs, [$trans[$i], $trans[$j]];
	    last;
	}
    }
    
    print "\nconnect transcripts with predicted: ", scalar(@trans_pairs), " pairs\n";
    my @new_trans;
    foreach my $pair (@trans_pairs) {
	foreach my $pt (@predicts) {
	    if ($pt->start > $pair->[0]->end || $pt->end < $pair->[1]->start){
		next;
	    }
	    print STDERR "\n===connect transcripts:\n";
	    print_transcript($pt);
	    print_transcript($pair->[0]);
	    print_transcript($pair->[1]);
	    my $compatible = 0;
	    if ($pair->[1]->start-$pair->[0]->end > 2000) {$compatible=1;}
	    my $new_tran = extend_transcript($pt, $pair->[1], 15, $compatible, 1);
	    print STDERR "\n===after connect transcripts 1\n";
	    print_transcript($new_tran);
	    if ($new_tran ne $pair->[1]) {
		$new_tran = extend_transcript($pair->[0],$new_tran, 15, $compatible);
		if ($new_tran ne $pair->[0]) {
		    $new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
			->compute_translation( $new_tran );
		    push @new_trans, $new_tran;
		    print STDERR "\n===after connect transcripts 2\n";
		    print_transcript($new_tran);
		    last;
		}
	    }
	}
    }

    my $complete_cds = 0; 
    foreach my $t (@cdna_trans, @trans, @new_trans) {
	if (complete_cds_length($t) 
	    && ($t->description =~ /cdna/ || length($t->translate->seq) > 100)) { 
	    $complete_cds = 1; last; 
	}
    }
    my $ts = [@cdna_trans, @new_trans, @trans];
    if (@$ts == 0) { return $transcripts; }
    if (!$complete_cds) {
	$ts = clean_cluster_by_intron_ratio($ts, $max_check_id, $max_bad_ratio);
	if (@$ts > 0) {
	    print STDERR "---replace partial genes\n";
	    return extend_transcripts([@predicts,@$ts], $max_end_ext, 0, 0, 0);
	}else {
	    $predicts[0]->description(create_description($predicts[0], $cdna_trans[0]||$new_trans[0]||$trans[0], 1));
	    my @sf;
	    if ($cdna_trans[0]) { push @sf, @{$cdna_trans[0]->get_all_supporting_features}; }
	    if ($new_trans[0]) { push @sf, @{$new_trans[0]->get_all_supporting_features}; }
	    if ($trans[0]) { push @sf, @{$trans[0]->get_all_supporting_features}; }	    
	    $predicts[0]->add_supporting_features(@sf);
	    return [$predicts[0]];
	}
    }

#    print STDERR "after connection: "; print_transcripts([@new_trans, @trans]);
    return remove_redundant_transcripts([@new_trans, @cdna_trans, @trans], 0, 0, $max_end_ext, 1);
}

# add translation if none exists
sub add_translation {
    my ($transcripts) = @_;
    my @newtrans;
    foreach my $tran (@$transcripts) {
	my $pro = $tran->translate;
#	print STDERR $tran->translate->seq, "\n" if $pro;
	if (!$pro || !$pro->seq || $pro->seq =~ /\*/ || !complete_cds_length($tran)) {
#	    print_transcript($tran);
#	    print $tran->seq->seq, "\n";
#	    print STDERR "recompute translation:\n"; # need after strand change
	    $tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
		->compute_translation( $tran );
	    $pro = $tran->translate;
	    if (! $pro || !$pro->seq || $pro->seq =~ /\*/) {
		print STDERR $tran->dbID, " TRANSCRIPT WITHOUT A TRANSLATION!!\n";
		next;
	    }
#	    print STDERR $tran->translate->seq, "\n";
	}
	push @newtrans, $tran;
    }
    return \@newtrans;
}

sub print_transcripts {
    my ($trans, $desc) = @_;
    if ($desc) {
	print STDERR "$desc\n";
    }
    print STDERR "### ", scalar @{$trans||[]}, " transcripts.\n";
    foreach my $t (@$trans) {
	print_transcript($t);
    }
}

sub print_transcript {
    my ($t, $desc) = @_;
    if ($desc) {
	print STDERR "$desc\n";
    }
    print STDERR $t->slice->name, "\n";
    $t->slice->name =~ /^(\w+)/;
    $t = $t->transform($1);
    print STDERR $t->stable_id || $t->dbID, ' ', $t->seq_region_name, ':', 
    $t->start, '-', $t->end, ' ', $t->strand, "\n";
    print_exons($t);
    my @sup_feats = sort {$b->percent_id <=> $a->percent_id}
    @{$t->get_all_supporting_features};
    print STDERR '    ', $t->biotype, ' ', $t->description||'--', "\n";
    if (!@sup_feats) {return;}
    print STDERR '    -- ', scalar @sup_feats, " supporting features: "; 
    foreach my $i (0..$#sup_feats) {
	print STDERR ' ', $sup_feats[$i]->hseqname, ':', $sup_feats[$i]->percent_id;
    }
    print STDERR "\n";
#    print_transcript_seq($t);

    return;
    my $new_tran = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils
        ->compute_translation( $t );
    my $sequence = $t->translate;    
    if ( $sequence ){
	print STDERR $sequence->seq, "\n";
    }else {
	print STDERR "TRANSCRIPT WITHOUT A TRANSLATION!!\n";
    }
}

sub print_exons {
    my ($transcript) = @_;
    print STDERR "    Exons: ";
    foreach my $exon (sort {$a->start <=> $b->start} @{$transcript->get_all_Exons}) {
	print STDERR " ", $exon->start, '-', $exon->end; 
#	print ' ', $exon, ' ', $exon->dbID, '(', $exon->strand, ')';
	}
    print STDERR "\n";
}

sub print_transcript_seq {
    my $trans = shift;
    $trans = $trans->transfer($trans->slice);
    my $exons = $trans->get_all_Exons;
    my $introns = $trans->get_all_Introns;
#    print STDERR "    Exons: ";
    foreach my $exon (@$exons) {
#	print STDERR " ", $exon->start, '-', $exon->end; 
    }
#    print STDERR "\n";
#    print STDERR "    Introns: ";
    foreach my $intron (@$introns) {
#	print STDERR " ", $intron->start, '-', $intron->end;
    }
#    print STDERR "\n";

    print STDERR '    ';
    foreach my $i (0..@$introns-1) {
        print STDERR uc $exons->[$i]->seq->seq; #print $introns->[$i]->start, '-', $introns->[$i]->end;
        print STDERR lc $introns->[$i]->seq; 
    }
    print STDERR uc $exons->[@$exons-1]->seq->seq;
    print STDERR "\n";
}
 

1;
