#!/usr/local/bin/perl -w
## USE set_off to change the lengths of sequences around the 
## region
package Gramene::Util::Splice;
use strict;
BEGIN{use lib("/home/ravi");}
use Gramene::Util::myUtil;
my $myUtil= Gramene::Util::myUtil->new();
my %primate5;
my %pos;
my %primate3;
my $mint;
my $maxt;

my $r_off_d;
my $l_off_d;
my $r_off_a;

my $l_off_a;

my $refDonor;
my @refDonor;

my $refAcceptor;
my @refAcceptor;

my %one;
#/*
#
#Score=100*((t1-l1)/(h1-l1) + (t2-l2)/(h2-l2))/2
#   t1= sum of best 8 of 10 percentages at pyrimidine positions
#   t2=sum of 4 best percentages at CAGG positions
#   l1= sum of lowest 8 of 10 percentages at pyrimidine positions (57)
#   l2 = sum of 4 lowest percentages at CAGG positions (11)
#   h1= sum of highest 8 of highest 10 percentages at pyrimidine positions (377)
#   h2 = sum of highest 4 percentages at CAGG positions (323)
#Note that the formula weighs the pyrimidine and CAGG parts equally.#
#
#***********/
##### ##### ##### ##### ##### ##### ##### 
##### new constructor.
##### ##### ##### ##### ##### ##### #####
sub new{
 my $name = shift;
 my $class = ref($name) || $name;
 my $this = {};
 bless $this,$class;
 $this->init();
 return $this;
}

my $ref="
Mammalian frequency matrices for GT-AG pairs
donor site

   34.08   60.36    9.14    0.00    0.00   52.57   71.26    7.08   15.98
   36.24   12.90    3.27    0.00    0.00    2.82    7.56    5.50   16.46
   18.31   12.48   80.34  100.00    0.00   41.94   11.76   81.35   20.90
   11.38   14.25    7.24    0.00  100.00    2.55    9.29    5.88   46.16

acceptor site

    9.00    8.44    7.51    6.82    7.63    7.98    9.66    9.23    7.57    7.69   23.76    4.21  100.00    0.00   23.86
   31.03   31.04   30.69   29.32   32.60   33.00   37.26   38.56   41.02   35.28   30.92   70.82    0.00    0.00   13.81
   12.50   11.52   10.56   10.35   11.05   11.32   11.25    8.52    6.56    6.37   21.14    0.32    0.00  100.00   51.97
   42.36   44.04   47.04   49.43   47.05   46.28   40.81   42.89   44.54   50.50   24.09   24.61    0.00    0.00   10.37


Mammalian frequency matrices for GC-AG pairs
donor site

   40.48   88.89    1.59    0.00    0.00   87.30   84.13    1.59    7.94
   42.06    0.79    0.79    0.00  100.00    0.00    3.17    0.79   11.90
   15.87    1.59   97.62  100.00    0.00   12.70    6.35   96.83    9.52
    1.59    8.73    0.00    0.00    0.00    0.00    6.35    0.79   70.63

acceptor site

   11.11   12.70    3.17    4.76   12.70    8.73   16.67   16.67   12.70    9.52   26.19    6.35  100.00    0.00   21.43
   36.51   30.95   19.05   23.02   34.92   39.68   34.92   40.48   40.48   36.51   33.33   68.25    0.00    0.00    7.94
    9.52   10.32   15.08   12.70    8.73    9.52   16.67    4.76    2.38    6.35   13.49    0.00    0.00  100.00   62.70
   38.89   41.27   58.73   55.56   42.06   40.48   30.95   37.30   44.44   47.62   26.98   25.40    0.00    0.00    7.94
";
##### ##### ##### ##### ##### ##### #####
##### init
##### ##### ##### ##### ##### ##### #####
sub init{
 $refDonor = "CAGGTAAGT";
 @refDonor=split(//,$refDonor);
 %one=(A=>2,C=>3,G=>4,T=>5);

 ## default settings
 $r_off_d=6;
 $l_off_d=3;
 $r_off_a=1;
 $l_off_a=14;

 #  $r_off_d=60;
 #  $l_off_d=60;

  $pos{A}=0;
  $pos{C}=1;
  $pos{G}=2;
  $pos{T}=3;

  ## first guy is at -3 (if 0 is at site of interest
  $primate5{off} = -3;
  $primate5{score} = [
		      [32,37,19,12],
		      [58,13,15,15],
		      [10,4,78,8],
		      [0,0,100,0],
		      [0,0,0,100],
		      [57,2,39,2],
		      [71,8,12,9],
		      [5,6,84,5],
		      [16,15,22,47]
		     ];
  $mint=59;
  $maxt=632;
  $primate3{off} = -14;

 ###  123456789012345
 ### "TTTTTTCTTTCCAGG";
 $refAcceptor = "TTTTTTCTTTCCAGG";
 @refAcceptor=split(//,$refAcceptor);

 $primate3{score} = [
		     [9,31,15,45],
		     [9,33,13,45],
		     [7,31,11,51],
		     [7,35,7,51],
		     [10,35,7,47],
		     [10,35,11,44],
		     [7,43,7,42],
		     [9,41,8,42],
		     [6,39,6,48],
		     [6,40,8,46],
		     [23,29,23,24],
		     [3,74,1,22],
		     [100,0,0,0],
		     [0,0,100,0],
		     [28,13,49,10]
		    ];
}
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### set_off
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
sub set_off{
  my $this=shift;
  $l_off_d=shift;
  $r_off_d=shift;
  $l_off_a=shift;
  $r_off_a=shift;
}
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### get_off
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
sub get_off{
  my $this=shift;
  return "l_off_d=$l_off_d\n".
    "r_off_d=$r_off_d\n".
      "l_off_a=$l_off_a\n".
	"r_off_a=$r_off_a\n";
}
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
### getdonors
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
sub getdonors{ 
    my $this=shift;
    my $P_str=shift;
    my $P_donor=shift;
    my $threshold=shift; if(!defined($threshold)){$threshold=0;}

    my $str = $$P_str;$str =~ tr/A-Z/a-z/;
#    warn "SP-$str\n";

    my $pos=$l_off_d-1;## 3;
    my $len=length($str)-$r_off_d;
   # warn "SP-$len,$pos\n";
    while($pos<$len){
	$pos = index($str,"gt",$pos);
#	warn "SP-$pos\n";
	## this cannot normally happen
	if($pos<=0){last;}## for safety, if the 7 above is not right 
	#   push(@$P_donor,[substr($str,$pos-3,9),$pos-3]);
	my $sbstr=substr($str,$pos-$l_off_d,$l_off_d+$r_off_d);
	my $score=$this->don_sena_Score($sbstr);
	if ($score>=$threshold) {
	#    warn "SP-$sbstr,$pos-$l_off_d,$score";
	    push(@$P_donor,[$sbstr,$pos-$l_off_d,$score]);
	}
	$pos+=2;
    }
}
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
### getacceptors
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
sub getacceptors{
    my $this=shift;
    my $P_str=shift;
    my $P_acceptor=shift;
    my $threshold=shift; if(!defined($threshold)){$threshold=0;}

    my $str = $$P_str;
    $str =~ tr/A-Z/a-z/;

    my $pos=$l_off_a-1-2;
    my $len=length($str)-$r_off_a-2;

    while($pos<$len){
	$pos = index($str,"ag",$pos);
#	print "str=$str,pos=$pos\n";
	## this cannot normally happen
	## for safety, if the 7 above is not right 
	if($pos<=0){last;}
# print "pos=$pos\n";print "l_off_a=$l_off_a\n";print "r_off_a=$r_off_a\n";
	my $sbstr=substr($str,$pos-$l_off_a+2,$l_off_a+$r_off_a);
	###	print "sbstr=$sbstr\n";
	my $score=$this->acc_sena_Score($sbstr);

	if ($score>=$threshold){
	    push(@$P_acceptor,[$sbstr,$pos-$l_off_a+2,$score]);
	}
	$pos+=2;
    }
}
#######    #######    #######    #######    #######
#######    #######    #######    #######    #######
#######  don_sena_Score
########## return -1 for invalid stuff
#######    #######    #######    #######    #######
#######    #######    #######    #######    ######
sub don_sena_Score{
  my $this=shift;
  my $ss=shift;
  $ss =~ tr/a-z/A-Z/;
  my @tmp = split(//,$ss);
  my $len=@tmp;
  if ($len != @{$primate5{score}}) {
       return -1; 
  }
  my $score=0;
#  $myUtil->prettyPrint($primate5{score});
  for(my $jk=0;$jk<@tmp;$jk++){ 
   # my $tmpsc;$tmpsc=$primate5{score}->[$jk]->[$pos{$tmp[$jk]}];
#    print "jk=$jk,tmp=$tmp[$jk],pos=$pos{$tmp[$jk]}\n";
    $score+=$primate5{score}->[$jk]->[$pos{$tmp[$jk]}];
  } 
  $score = 100*($score-$mint)/($maxt-$mint);

  $score+=0.5;  return int($score);
  # $score=~s/(\.\d\d)\d+$/$1/;  return $score;
}

#######    #######    #######    #######    #######
#######    #######    #######    #######    #######
#######  acc_sena_Score
#######    #######    #######    #######    #######
#######    #######    #######    #######    ######
sub acc_sena_Score{
  my $this=shift;
  my $ss=shift;
  $ss =~ tr/a-z/A-Z/;
  my @tmp = split(//,$ss);
  my $len=@tmp;
  my $score=0;
  if ($len != @{$primate3{score}}) {return -1;}

  my @scores;
  for(my $jk=0;$jk<$len;$jk++){
    $scores[$jk]=$primate3{score}->[$jk]->[$pos{$tmp[$jk]}];
  }
  my @pyr = @scores[0 .. 9];
  my @pur = @scores[11 .. 14];
  
  @pyr = sort {$b<=>$a}@pyr;
  @pur = sort {$b<=>$a}@pur;

  my $t1=0; ## sum of 8/10 best percentages at pyrimidine positions.
  for (my $i=0;$i<8;$i++) {$t1+=$pyr[$i];}

  my $t2=0; ## sum of 4 best percentages at CAGG positions.
  for (my $i=0;$i<4;$i++) {$t2+=$pur[$i];}

  my $l1 =57; ## sum of lowest 8/10 best percentages at pyrimidine positions.
  my $l2 =11; ## sum of 4 best percentages at CAGG positions.

  my $h1=377;##sum of highest 8 of  10 percentages at pyrimidine position(377)
  my $h2=323;##sum of highest 4 percentages of CAGG positions (323)

  $score = 100*(($t1-$l1)/($h1-$l1) + ($t2-$l2)/($h2-$l2))/2;

  $score+=0.5; return return int($score);
## old
#  $score=~s/(\.\d\d)\d+$/$1/;  return $score;
 
}
#######    #######    #######    #######    #######    
####### don_our_Score
#######    #######    #######    #######    #######    
sub don_our_Score{
  my $this = shift;
  my $ss=shift;
  $ss =~ tr/a-z/A-Z/;
  my @tmp = split(//,$ss);
  my $score="";
  $score .= $one{$tmp[0]};
  for(my $ij=1;$ij<@tmp;$ij++){ 
    if($tmp[$ij] eq $refDonor[$ij-1])
      {$score .= "0";}
    else 
      {$score .= "1";}
  } 
  return $score;
}
#######   #######   #######   #######   #######   #######   
#######   sites
#######  Usage :: $sp->sites(
##	                $name,$seq,$content{markers},
##                       \@exons,\@introns,\@donors,\@acceptors,\@cryptics
##                   ); 
#######   #######   #######   #######   #######   #######   
sub sites{
  my $this=shift;
  my $name= shift;
  my $seq = shift;
  my $P_markers = shift;

  my ($P_exons,$P_introns,$P_donors,$P_acceptors,$P_cryptics)
   = (shift,shift,shift,shift,shift);

  my @tmp = @{$P_markers};
  my (@cryexon,@cry);
  my (@exon_end,@exon_st);
  foreach(@tmp){
    #5369 label=exon4_end
    if(/^(\d+)\s+label=(\S+)/){ 
      my $pos = $1;
      my $label = $2;
      #   print "pos=$pos label=$label\n";
      if($label=~/exon(\d+)\_start/){
	$exon_st[$1-1] = $pos;
      }elsif($label=~/exon(\d+)\_end/){
	$exon_end[$1-1]=$pos;
      }elsif($label=~/cryptic(\d+)\_start/){
	$cry[$1-1]=$pos;
      }
    }
  }
  if(@exon_end != @exon_st){ die " problem in number of exon starts and ends";}

  # print "First exon=";
  my $exon = substr($seq,$exon_st[0],$exon_end[0]-$exon_st[0]);
  push(@$P_exons,$exon);

  my $intron = substr($seq,$exon_end[0],$exon_st[1]-$exon_end[0]);
  push(@$P_introns,$intron);

  my $donor=substr($seq,$exon_end[0]-$l_off_d,$l_off_d+$r_off_d);
  if(!defined($donor)){die "bad donor";} 
  push(@$P_donors,$donor);

  for(my $ij=1;$ij<(@exon_end-1);$ij++){ 
    my $exon = substr($seq,$exon_st[$ij],$exon_end[$ij]-$exon_st[$ij]);
    push(@$P_exons,$exon);

    my $intron = substr($seq,$exon_end[$ij],$exon_st[$ij+1]-$exon_end[$ij]);
    push(@$P_introns,$intron);

    my $acceptor=substr($seq,$exon_st[$ij]-$l_off_a,$l_off_a+$r_off_a);
    push(@$P_acceptors,$acceptor);

    my $donor=substr($seq,$exon_end[$ij]-$l_off_d,$l_off_d+$r_off_d);
    push(@$P_donors,$donor);

    if(!defined($donor)){
      print STDERR "$name\n";
      print STDERR "$exon_end[$ij]-$l_off_d,$l_off_d+$r_off_d\n";
      die "bad donor";
    } 
  } 
  ## The last guy
  $exon = substr($seq,$exon_st[-1],$exon_end[-1]-$exon_st[-1]);
  push(@$P_exons,$exon);

  my $acceptor=substr($seq,$exon_st[-1]-$l_off_a,$l_off_a+$r_off_a);
  push(@$P_acceptors,$acceptor);

  foreach(@cry){
    my $sbstr=substr($seq,$_-$l_off_d,$l_off_d+$r_off_d);
    push(@$P_cryptics,$sbstr);
  }  
  return;
} 

###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
###  fixsplice
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
sub fixsplice{
  my $this = shift;
  my $P_seq = shift;
  my @tmp = $$P_seq=~ m{([A-Z]{6,12}
		      [a-z]{6,12})([a-z]*?)
		     ([a-z]{6,12}
		      [A-Z]{6,12})
		    }xgs;
  for(my $i=0;$i<@tmp;$i+=3){
    my $j=$i+1;
    my $k=$i+2;
    ## print STDERR "i=$i\n";
    ## print STDERR "$tmp[$i] ---- $tmp[$k]\n";
    #  print STDERR "$tmp[$i] ---- $tmp[$k]\n";
    my ($one,$two,$three)=($tmp[$i],$tmp[$i+1],$tmp[$i+2]);
    if($one =~ /[A-Z]gt/ && $three=~/ag[A-Z]/){
      # TGCTGgtgg ---- ttccagGCGA
      ## print STDERR "No Problem\n";
      ### ADDtions to the three side.
      next;
    }elsif($one =~ /Gt/ && $three=~/agg[A-Z]/){
      # GCAGGtgcgt ---- ccaggCGGGG
      $one =~ s/Gt/gt/;$three=~ s/agg([A-Z])/agG$1/;
      ## print STDERR "1--$one ---- $three\n";
         ## minimalist approach
    }elsif($three=~/aggt([a-z]*?)[A-Z]/ && $one=~/GT\U$1\E[a-z]/){
      # GCGTABCgcgt ---- aggtabcCGGGG
      $three=~s/aggt([a-z]*?)([A-Z])/agGT\U$1\E$2/;
      my $t="\U$1\E";
      $one=~s/GT$t([a-z])/gt\L$t\E$1/;
      ## print STDERR "2--$one ---- $three\n";
     ## non minimalist approach
    }elsif($three=~/aggt([a-z]*)[A-Z]/ && $one=~/GT\U$1\E[a-z]/){
      # GCGTABCgcgt ---- aggtabcCGGGG
      $three=~s/aggt([a-z]*)([A-Z])/agGT\U$1\E$2/;
      my $t="\U$1\E";
      $one=~s/GT$t([a-z])/gt\L$t\E$1/;
      ## print STDERR "3--$one ---- $three\n";
      ## BELOW are ADDITIONS to the one side.
    }elsif($one =~ /[A-Z]ggt/ && $three=~/aG/){
      # TGCTGggtgg ---- ttccaGGCGA
      $one =~ s/([A-Z])ggt/$1Ggt/;$three=~ s/aG/ag/;
      ## print STDERR "4--$one ---- $three\n";
      ## minimalist approach
    }elsif($one=~/[A-Z]([a-z]*?)aggt/ && $three=~/[a-z]\U$1\EAG/){
      #CATGGcaggt ---- ccgccCAGGT
      $one =~ s/([A-Z])([a-z]*?)aggt/$1\U$2\EAGgt/;
      my $t="\U$2\E";
      $three=~s/
	([a-z])($t
		AG)
		  /$1\L$2\E/x;
      ## print STDERR "5--$one ---- $three\n";
      ## greedy approach
    }elsif($one=~/[A-Z]([a-z]*)aggt/ && $three=~/[a-z]\U$1\EAG/){
      #CATGGcaggt ---- ccgccCAGGT
      $one =~ s/([A-Z])([a-z]*)aggt/$1\U$2\EAGgt/;
      my $t="\U$2\E";
      $three=~s/
	([a-z])($t
		AG)
		  /$1\L$2\E/x;
      ## print STDERR "6--$one ---- $three\n";
    } else { 
      print LOG "Not known pattern\n";
    }
    ## print STDERR "\n===========================\n";
    $$P_seq =~ s/$tmp[$i]$tmp[$j]$tmp[$k]/$one$two$three/;
  }
  return;
} 
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
###  fixsplice_alt
## Fixes gc...ag (instead of gt..ag
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
sub fixsplice_alt{
  my $this=shift;
  my $P_seq = shift;
  my @tmp = $$P_seq=~ m{([A-Z]{6,12}
		      [a-z]{6,12})([a-z]*?)
		     ([a-z]{6,12}
		      [A-Z]{6,12})
		    }xgs;
  for(my $i=0;$i<@tmp;$i+=3){
    my $j=$i+1;
    my $k=$i+2;
    ## print STDERR "i=$i\n";
    ## print STDERR "$tmp[$i] ---- $tmp[$k]\n";
    #  print STDERR "$tmp[$i] ---- $tmp[$k]\n";
    my ($one,$two,$three)=($tmp[$i],$tmp[$i+1],$tmp[$i+2]);
    if($one =~ /[A-Z]gt/ && $three=~/ag[A-Z]/){
      ## print STDERR "No Problem\n";
      next;
    } elsif($one =~ /[A-Z]gc/ && $three=~/ag[A-Z]/){
      # TGCTGgcgg ---- ttccagGCGA
      next;
    }elsif($one =~ /Gc/ && $three=~/agg[A-Z]/){
      # GCAGGtgcgt ---- ccaggCGGGG
      $one =~ s/Gc/gc/;$three=~ s/agg([A-Z])/agG$1/;
      ##  print STDERR "1--$one ---- $three\n";
      ## minimalist approach
    }elsif($three=~/aggc([a-z]*?)[A-Z]/ && $one=~/GC\U$1\E[a-z]/){
      # GCGTABCgcgt ---- aggtabcCGGGG
      $three=~s/aggc([a-z]*?)([A-Z])/agGC\U$1\E$2/;
      my $t="\U$1\E";
      $one=~s/GC$t([a-z])/gc\L$t\E$1/;
      ## print STDERR "2--$one ---- $three\n";
     ## non minimalist approach
    }elsif($three=~/aggc([a-z]*)[A-Z]/ && $one=~/GC\U$1\E[a-z]/){
      # GCGTABCgcgt ---- aggtabcCGGGG
      $three=~s/aggc([a-z]*)([A-Z])/agGC\U$1\E$2/;
      my $t="\U$1\E";
      $one=~s/GC$t([a-z])/gc\L$t\E$1/;
      ## print STDERR "3--$one ---- $three\n";
      ## BELOW are ADDITIONS to the one side.
    }elsif($one =~ /[A-Z]ggc/ && $three=~/aG/){
      # TGCTGggtgg ---- ttccaGGCGA
      $one =~ s/([A-Z])ggc/$1Ggc/;$three=~ s/aG/ag/;
      ## print STDERR "4--$one ---- $three\n";
      ## minimalist approach
    }elsif($one=~/[A-Z]([a-z]*?)aggc/ && $three=~/[a-z]\U$1\EAG/){
      #CATGGcaggt ---- ccgccCAGGT
      $one =~ s/([A-Z])([a-z]*?)aggc/$1\U$2\EAGgc/;
      my $t="\U$2\E";
      $three=~s/
	([a-z])($t
		AG)
		  /$1\L$2\E/x;
      ## print STDERR "5--$one ---- $three\n";
      ## greedy approach
    }elsif($one=~/[A-Z]([a-z]*)aggc/ && $three=~/[a-z]\U$1\EAG/){
      #CATGGcaggt ---- ccgccCAGGT
      $one =~ s/([A-Z])([a-z]*)aggc/$1\U$2\EAGgc/;
      my $t="\U$2\E";
      $three=~s/
	([a-z])($t
		AG)
		  /$1\L$2\E/x;
      ## print STDERR "6--$one ---- $three\n";
    } else { 
      print LOG "Not known pattern\n";
    }
#    print STDERR "\n===========================\n";
#    print STDERR "$one\n$three\n";
    $$P_seq =~ s/$tmp[$i]$tmp[$j]$tmp[$k]/$one$two$three/;
#    print STDERR "$one\n$three\n";
#    print STDERR "ALT\n===========================\n";
  }
  return;
} 
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
###  fixsplice_oneOff
### fixes one-off errors, which cannot be fixed by 
#### fixsplice_more
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
sub fixsplice_oneOff{
  my $this = shift;
  my $P_seq = shift;
  my @tmp = $$P_seq=~ m{([A-Z]{6,12}
		      [a-z]{6,12})([a-z]*?)
		     ([a-z]{6,12}
		      [A-Z]{6,12})
		    }xgs;
  for(my $i=0;$i<@tmp;$i+=3){
    my $j=$i+1;
    my $k=$i+2;
    my ($one,$two,$three)=($tmp[$i],$tmp[$i+1],$tmp[$i+2]);
    if($one =~ /[A-Z]gt/ && $three=~/ag[A-Z]/){
      ## print STDERR "No Problem\n";
      ## NO PROBLEM
      next;
      ## neither gt not ag
      ## the length stuff can be fixed also starting with three
      ## now we start with one, so we should try that in another sub routine
   }elsif($one =~ /[A-Z]gc/ && $three=~/ag[A-Z]/){
      ## print STDERR "No Problem\n";
      ## NO PROBLEM
      next;
      ## neither gt not ag
      ## the length stuff can be fixed also starting with three
      ## now we start with one, so we should try that in another sub routine
    }elsif($one =~ /GT[A-Z]*(g[^t]|[^g][a-z])/ && 
	   $three=~/ag[a-z]*(a[^g]|[^a][a-z])[A-Z]/){ 
      ## there is a chance to fix something here
      ###  GTGAGg gcctgc   aggaga gTA
      # ACCAGGTTAGAGatgctcagtgcc tttagattagagCTGGAACTGAGG
     my $r_one = reverse($one); 
	my $onem2;my $onem1;
      if($r_one=~/([a-z])([A-Z]*?TG)/){ 
	$onem2=$1; $onem2 = reverse($onem2);
	$onem1=$2; $onem1 = reverse($onem1);
      } else { die "$one,$two error 2\n";}
      my $len = length($onem1);
      my $r_three = reverse($three); 
      my $threem2;my $threem1;
      if($r_three=~/([A-Z])([a-z]{$len})ga/){ 
	$threem2=$1;$threem2 = reverse($threem2);
	$threem1=$2;$threem1 = reverse($threem1);
      }
      my $sd=$this->seqdiff($onem1,$threem1);
      if($sd==1){
	$one=~s/$onem1$onem2/\L$onem1$onem2\E/;
	$three=~s/$threem1$threem2/\U$threem1$threem2\E/;
	print STDERR "fixed problem1 $one $three\n";
      } else { 
	print STDERR "failed to fix problem1 $sd $onem1 $threem1\n";
	print STDERR "failed to fix problem1 $one $three\n";
      } 
      ## not gt but ag
    }elsif($one =~ /GT[A-Z]*(g[^t]|[^g][a-z])/ && 
	   $three=~/ag[a-z]*(ag)[A-Z]/){ 
      ## there is a chance to fix something here
      ###  GTGAGg gcctgc   aggaga gTA
      # ACCAGGTTAGAGatgctcagtgcc tttagattagagCTGGAACTGAGG
      my $r_one = reverse($one); 
      my $onem2;my $onem1;
      if($r_one=~/([a-z])([A-Z]*?TG)/){ 
	$onem2=$1; $onem2 = reverse($onem2);
	$onem1=$2; $onem1 = reverse($onem1);
      } else { die "error 3 $one $three\n";}
      my $len = length($onem1);
      $len-=2;
      my $r_three = reverse($three); 
      my $threem2;my $threem1;
      if($r_three=~/([A-Z])(ga[a-z]{$len})ga/){ 
	 $threem2=$1;$threem2 = reverse($threem2);
	 $threem1=$2;$threem1 = reverse($threem1);
      } else { 
	print STDERR "HAHAH\n";
      } 
      my $sd=$this->seqdiff($onem1,$threem1);
      if($sd==1){
	$one=~s/$onem1$onem2/\L$onem1$onem2\E/;
	$three=~s/$threem1$threem2/\U$threem1$threem2\E/;
	print STDERR "fixed problem1c $one $three\n";
      } else { 
	print STDERR "failed to fix problem1an $sd $onem1 $threem1\n";
	print STDERR "failed to fix problem1an $one $three\n";
      } 
      ####       $three=~s/([a-z])($tAG)/$1\L$2\E/x;
      ##  gt but not ag
    }elsif($one =~ /GT[A-Z]*(gt)/ && 
	   $three=~/ag[a-z]*([^a][a-z]|a[^g])[A-Z]/){ 
      ## there is a chance to fix something here
      ###  GTGAGg gcctgc   aggaga gTA
      my $r_one = reverse($one); 
      my $onem2;
      my $onem1;

      if($r_one=~/(tg)([A-Z]*?TG)/){ 
	$onem2=$1; $onem2 = reverse($onem2);
	$onem1=$2; $onem1 = reverse($onem1);
      } else { die "something wrong 1 $one $three";}
      my $len = length($onem1);
      my $r_three = reverse($three); 
      my $threem2;
      my $threem1;
      if($r_three=~/([A-Z])([a-z]{$len})ga/){
	$threem2=$1;$threem2 = reverse($threem2);
	$threem1=$2;$threem1 = reverse($threem1);
      }
      my $sd=$this->seqdiff($onem1,$threem1);
      if($sd==1){
	$one=~s/$onem1$onem2/\L$onem1$onem2\E/;
	$three=~s/$threem1$threem2/\U$threem1$threem2\E/;
	print STDERR "fixed problem1 $one $three\n";
      } else { 
	print STDERR "failed to fix problem1d $sd $onem1 $threem1\n";
	print STDERR "failed to fix problem1d $one $three\n";
      } 
      ####       $three=~s/([a-z])($tAG)/$1\L$2\E/x;
      ### neither gt not ag
    }elsif($one =~ /[A-Z](([^g][a-z]|g[^t])[a-z]*)gt/ && 
	   $three=~/([^a][a-z]|a[^g])[A-Z]*AG/){ 
      ## there is a chance to fix something here
      ###  GTGAGg gcctgc   aggaga gTA
      my $onem1;my $onem2;
      if($one =~ /[A-Z]([a-z]*?)(gt)/){ 
	 $onem1=$1;
	 $onem2=$2;
      } else { die "error 4 $one $three";}
      my $len = length($onem1);
      $len -= 2;
	my $threem1;
	my $threem2;
      if($three=~/([a-z])([A-Z]{$len}AG)/){ 
	 $threem1=$1;
	 $threem2=$2;
      } 
      my $sd=$this->seqdiff($onem1,$threem2);

#      if($sd==1){
      if($sd==2||$sd==1){
	$one=~s/$onem1$onem2/\U$onem1\E$onem2/;
	$three=~s/$threem1$threem2/\L$threem1$threem2\E/;
      }  else { 
	print STDERR "failed to fix problem2 $sd $onem1 $threem1\n";
	print STDERR "failed to fix problem2 $sd $one $three\n";
      } 
      ###  gt not ag
    }elsif($one =~ /[A-Z](gt[a-z]*)gt/ && 
	   $three=~/([^a][a-z]|a[^g])[A-Z]*AG/){ 
      ## there is a chance to fix something here
      ###  GTGAGg gcctgc   aggaga gTA
      my $onem1;
      my $onem2;
      if($one =~ /[A-Z](gt[a-z]*?)(gt)/){ 
	$onem1=$1;
	$onem2=$2;
      } else { die "error 5 $one $three";}
      my $len = length($onem1);
      $len -= 2;
	my $threem1;
	my $threem2;
      if($three=~/([a-z])([A-Z]{$len}AG)/){ 
	$threem1=$1;
	$threem2=$2;
      } 
      my $sd=$this->seqdiff($onem1,$threem1);
      if($sd==1){
	$one=~s/$onem1$onem2/\U$onem1\E$onem2/;
	$three=~s/$threem1$threem2/\L$threem1$threem2\E/;
      }  else { 
	print STDERR "failed to fix problem3 $sd $onem1 $threem1\n";
	print STDERR "failed to fix problem3 $sd $one $three\n";
      } 
      ### not gt , ag
    }elsif($one =~ /[A-Z](([^g][a-z]|g[^t])[a-z]*)gt/ && 
	   $three=~/(ag)[A-Z]*AG/){ 
      ## there is a chance to fix something here
      ###  GTGAGg gcctgc   aggaga gTA
      my $onem1;
      my $onem2;
      if($one =~ /[A-Z]([a-z]*?)(gt)/){ 
	 $onem1=$1;
	 $onem2=$2;
      } else { die "error 6 $one $three\n";}
      my $len = length($onem1);
      $len -= 2;
	my $threem1;
	my $threem2;
      if($three=~/([a-z])([A-Z]{$len}AG)/){ 
	 $threem1=$1;
	 $threem2=$2;
      } 
      my $sd=$this->seqdiff($onem1,$threem1);

      if($sd==1){
	$one=~s/$onem1$onem2/\U$onem1\E$onem2/;
	$three=~s/$threem1$threem2/\L$threem1$threem2\E/;
      }  else { 
	print STDERR "failed to fix problem2 $sd $onem1 $threem1\n";
      } 
      #### #### #### #### ##### $three=~s/([a-z])($tAG)/$1\L$2\E/x;
    } else { 
      #### print  LOG "fixsplice_oneOff Not known pattern\n";
      print  STDERR "fixsplice_oneOff Not known pattern\n";
      print  STDERR "$one\n$three\n";
      print  LOG "fixsplice_oneOff Not known pattern\n";
      print  LOG "$one\n$three\n";
    }
    ## print STDERR "\n===========================\n";
    $$P_seq =~ s/$tmp[$i]$tmp[$j]$tmp[$k]/$one$two$three/;
  }
  return;
} 
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
###  fixsplice_more
### fixes one-off errors, not really kosher
###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  ###  
sub fixsplice_more{
  my $this=shift;
  my $P_seq = shift;
  my @tmp = $$P_seq=~ m{([A-Z]{6,12}
		      [a-z]{6,12})([a-z]*?)
		     ([a-z]{6,12}
		      [A-Z]{6,12})
		    }xgs;
  for(my $i=0;$i<@tmp;$i+=3){
    my $j=$i+1;
    my $k=$i+2;
    my ($one,$two,$three)=($tmp[$i],$tmp[$i+1],$tmp[$i+2]);
    if($one =~ /[A-Z]gt/ && $three=~/ag[A-Z]/){
      ## print STDERR "No Problem\n";
      ## NO PROBLEM
      next;
   }elsif($one =~ /[A-Z]gc/ && $three=~/ag[A-Z]/){
      ## print STDERR "No Problem\n";
      ## NO PROBLEM
      next;
      ## neither gt not ag
      ## the length stuff can be fixed also starting with three
      ## now we start with one, so we should try that in another sub routine
#    }elsif($one =~ /Gt/ && $three=~/ag[A-Z]/){
      ## fixing a small one base pair error
#      $one =~ s/Gt/gt/;
    }elsif($one =~ /Gt/ && $three=~/a[a-z][A-Z]/){
      ## fixing a small one base pair error
      $one =~ s/Gt/gt/;
      $three =~ s/a[a-z]([A-Z])/ag$1/;
    }elsif($one =~ /Gt/ && $three=~/[a-z]g[A-Z]/){
      ## fixing a small one base pair error
      $one =~ s/Gt/gt/;
      $three =~ s/[a-z]g([A-Z])/ag$1/;
    }elsif($one =~ /[A-Z][a-z]t/ && $three=~/aG[A-Z]/){
      ## fixing a small one base pair error
      $one =~ s/([A-Z])[a-z]t/$1gt/;
      $three =~ s/aG([A-Z])/ag$1/;
    }elsif($one =~ /[A-Z]g[a-z]/ && $three=~/aG[A-Z]/){
      ## fixing a small one base pair error
      $one =~ s/([A-Z])g[a-z]/$1gt/;
      $three =~ s/[a-z]g([A-Z])/ag$1/;
    }elsif($one =~ /[A-Z]gt/ && $three=~/aG[A-Z]/){
      $three=~s/aG/ag/;
    }elsif($one =~ /[A-Z]Gt/ && $three=~/aG[A-Z]/){
     $one =~ s/Gt/gt/; $three=~s/aG/ag/;
    }elsif($one =~ /[A-Z][a-z]gt/ && $three=~/ag[A-Z]/){
      $one =~ s/([A-Z])([a-z])gt/$1\U$2\Egt/;
    }elsif($one =~ /[A-Z]gt/ && $three=~/ag[a-z][A-Z]/){
      $three =~ s/ag([a-z])([A-Z])/ag\U$1\E$2/;
    }elsif($one =~ /[A-Z][a-z]gt/ && $three=~/ag[a-z][A-Z]/){
      $one =~ s/([A-Z])([a-z])gt/$1\U$2\Egt/;
      $three =~ s/ag([a-z])([A-Z])/ag\U$1\E$2/;
    } else { 
      print  LOG "fixsplice_more Not known pattern\n";
    }
    ## print STDERR "\n===========================\n";
    $$P_seq =~ s/$tmp[$i]$tmp[$j]$tmp[$k]/$one$two$three/;
  }
  return;
} 
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
## return number of differences between two sequences of equal length, 
## does not do any smart thing like diff does.
## a minus 1 return means meaningless value.
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
sub seqdiff{
  my $this=shift;
  my $seq1=shift;
  my $seq2=shift;
  if(length($seq1) != length($seq2)){return -1;} 
  $seq1=~tr/a-z/A-Z/;$seq2=~tr/a-z/A-Z/;
  my @tmp1 = split(//,$seq1);my @tmp2 = split(//,$seq2);
  my $cnt=0;
  for(my $i=0;$i<@tmp1;$i++)
    {if($tmp1[$i] ne $tmp2[$i]){$cnt++;}} 
  return $cnt;
}


1;
