#!/usr/local/bin/perl

# Filename: GetRiceGenome.pl
 
# Description:
#    It searches GenBank for rice BACs and PACs as well as their annotations. 
# Then, it parses, calculates and formats them into text data files according 
# to the table definition in the Oracle database for rice genome. Finally, 
# all of the data will be loaded into the relational database.

# Version: 1.0

# Author: Xiaokang Pan (pan@cshl.org)


use strict;
use Getopt::Long;
use Boulder::Genbank;

# Accession numbers supplied with --acc (the option may be repeated). When
# none are given the script builds a GenBank query instead (see below).
my @accns_to_fetch;
GetOptions("acc=s"=>\@accns_to_fetch);


# Open every output file and switch its handle to autoflush. Each entry pairs
# the bareword filehandle used by the rest of the script with the file it
# writes, so a failure can be reported with the file name and the OS error.
for my $output (
    [ \*OUTClone,       "clone.dat" ],
    [ \*OUTContig,      "contig.dat" ],
    [ \*OUTExon,        "exon.dat" ],
    [ \*OUTGene,        "gene.dat" ],
    [ \*OUTGenedesc,    "gene_description.dat" ],
    [ \*OUTGenetype,    "genetype.dat" ],
    [ \*OUTTransl,      "translation.dat" ],
    [ \*OUTTransc,      "transcript.dat" ],
    [ \*OUTExon_Transc, "exon_transcript.dat" ],
    [ \*OUTContig_land, "contig_land.dat" ],
    [ \*OUTDNA,         "dna.dat" ],
    [ \*OUTDNASEQ,      "dnaseq.fasta" ],
    [ \*OUTChr,         "chromosome.dat" ],
    [ \*OUTGoldenPath,  "static_golden_path.dat" ],
    [ \*OUTXref,        "Xref.dat" ],
    [ \*OUTobjectXref,  "objectXref.dat" ],
    [ \*OUTRepeat,      "repeat_feature.dat" ],
    [ \*OUTSummary,     "summary.dat" ],
) {
    my ($fh, $file) = @$output;

    open($fh, '>', $file) || die "cannot create $file: $!";

    # Unbuffer the handle: select it, set $|, then restore the previously
    # selected handle.
    select((select($fh), $| = 1)[0]);
}


#get current date (UTC), formatted as YYYY/M/D without zero padding
my $now = time;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime($now);
my $today = ($year+1900)."/".($mon+1)."/".$mday;

#get tomorrow's date in the same format. gmtime does the calendar
#arithmetic, so month ends, year ends and leap years are all handled
#(UTC days are always 24*60*60 seconds long).
my ($next_mday, $next_mon, $next_year) = (gmtime($now + 24*60*60))[3,4,5];
my $nextday = ($next_year+1900)."/".($next_mon+1)."/".$next_mday;

#check when this script was run last time: the last non-blank line of
#timeRecordFile is used as the start date of the GenBank query below
open (INPUTDate, "timeRecordFile") || die "cannot open timeRecordFile:$!";
my ($date, $ini_date);
while ($date = <INPUTDate>) {
  chomp($date);
  next unless $date =~ /\S/;   # a stray blank line must not erase the date
  $ini_date = $date;
}
close INPUTDate;               # release the file before reopening it below

#record the date of nextday as the initial date of next run
open (OUTDate, ">>timeRecordFile") || die "cannot append to timeRecordFile:$!";
print OUTDate "$nextday\n";
close OUTDate;

# network access via Entrez
# $gb is the record stream consumed by the main loop. It is built either from
# the accessions given on the command line or from a query for rice sequences.
my $gb;
if(@accns_to_fetch) {
    # Each --acc value may itself hold several accessions separated by
    # '+', ' ' or ','; flatten them into one space-separated string.
    my $acclist = join " ", map { split (/[+ ,]/,$_) } @accns_to_fetch;
    print "to fetch $acclist\n";
    $gb = Boulder::Genbank->new( -accessor => 'Entrez',
                                 -fetch => [$acclist]) or die "Boulder: $!";
} else {
    my $query = "Oryza [ORGN] AND (30000:1000000[SLEN]) AND ";
    if ($ini_date) {
      # Restrict to records modified between the last recorded date and today.
      $query .= $ini_date.":";
      $query .= $today."[MDAT] AND ";
    }
    $query .="((htg [KYWD] OR BAC [ALL] OR chromosome [TITL] OR PAC [ALL]) ";
    $query .="NOT (marker [TITL] OR cDNA [TITL] OR mRNA [TITL] OR ";
    # The "RAPD [TITL]" field tag was missing its closing bracket.
    $query .="RAPD [TITL] OR GSS [KYWD] OR telomere [TITL] OR Protein [TITL]))";

    print "$query\n";

    # NOTE(review): -db => 'n' presumably selects the nucleotide database;
    # confirm against the Boulder::Genbank Entrez accessor.
    $gb = Boulder::Genbank->new( -accessor=>'Entrez',
                                 -query=> $query,
                                 -db   => 'n'
                               ) or die "Boulder: $!";
}

# Running totals. They start at zero and are then overwritten by
# getLastTimeResults() with the values saved in summary.previous, so that
# counters (and the ids derived from them) continue from the previous run.
my $count = 0;   #to count the number of BACs and PACs.
my ($phase1_count, $phase2_count, $phase3_count)=(0, 0, 0);
my ($phase1_size, $phase2_size, $phase3_size)=(0, 0, 0);
my $ctg_count = 0;
my ($known_genes, $unknown_genes) = (0, 0);

my $exon_count = 0;
my $repeat_count = 0;
my $transl_count = 0;
my $transc_count = 0;

# Per-chromosome gene tallies and per-site entry/size tallies.
my (%known_gene, %unknown_gene);
my (%site_entries, %site_size);
my (%phase1_site_entries, %phase1_site_size);
my (%phase2_site_entries, %phase2_site_size);
my (%phase3_site_entries, %phase3_site_size);

# Attributes of the record currently being processed. They are file-scoped
# because the subs below (getGene, getGoldenPath, ...) read them directly.
my ($clone_id, $clone, $gi, $version, $create_date, $htg_phase, $chr, $site);

getLastTimeResults();

#start parsing the data: one iteration per GenBank record returned by $gb
while (my $data = $gb->get) {

  $count++;

  #to get accession number
  $clone_id = $data->Accession;

  #to get gi number (the first character of the NID value is dropped)
  $gi = $data->Nid;
  $gi = substr($gi, 1, length($gi));

  #to get clone name; fall back to the accession when the source feature
  #carries no clone qualifier
  $clone = $data->Features->Source->Clone;
  if (!$clone) {
    $clone = $clone_id;
  }

  #to get site where the sequence comes from
  $site = getSite($data);

  #to get chromosome where the sequence belongs to
  $chr = getChr($data, $site);

  #to get the version of the GenBank record (the part after the '.')
  my $ver = $data->Version;
  my @chars = split(/\./, $ver);
  $version = $chars[1];

  #to get the keywords
  #NOTE(review): chop presumably strips the terminating period - confirm
  my $keywords = $data->Keywords;
  chomp($keywords);
  chop($keywords);

  #to get the locus
  my $locus = $data->Locus;
  my @locus_words = split(/ /, $locus);

  #to get create date (last word of the locus line)
  $create_date = $locus_words[-1];

  #to get phase
  $htg_phase = findHTG_Phase($locus, $keywords);

  my $seq   = $data->Sequence;
  my $start = $data->start;
  my $end   = $data->end;
  my $length= $data->length;

  $site_entries{$site}++;
  $site_size{$site} += $length;

  if ($clone_id) {
    #site1 is just the id of this site, i.e. the text before '('
    my @temp = split(/\(/, $site);
    my $site1 = $temp[0];

    #put clone data into clone table
    print OUTClone "$clone_id,$clone_id,$gi,$clone,$version,$version,$htg_phase,$create_date,$create_date,$site1\n";

  }

  #to get contig and DNA data into contig table
  my ($ctg_id, $ctg_id1, $offset, $ctg_len);
  if ($htg_phase == 3) {
    #finished sequence: the whole record is a single contig
    $ctg_count++;
    $phase3_count++;
    $phase3_size += $length;
    $phase3_site_entries{$site}++;
    $phase3_site_size{$site} += $length;
    $ctg_id = $clone_id.".001";
    $ctg_id1= $clone_id.".".$version."~1";
    $offset = 1;
    $ctg_len= $length;

    print OUTDNA "$seq,$length,$create_date\n";
    print OUTDNASEQ ">$ctg_id\n",join("\n",$seq =~ /(.{1,60})/g),"\n";
                                #- 60  chars per line
    print OUTContig "$ctg_id,$count,$ctg_len,$offset,0,$ctg_count,$chr,$ctg_id1\n";

    print OUTContig_land "$ctg_id,$clone_id,$start,$end,1,R$chr\n";

    getGoldenPath($offset, $ctg_len);

    getRepeat($data);

  } else {

    if ($htg_phase == 1) {
      $phase1_count++;
      $phase1_size += $length;
      $phase1_site_entries{$site}++;
      $phase1_site_size{$site} += $length;

    } elsif ($htg_phase == 2) {

      $phase2_count++;
      $phase2_size += $length;
      $phase2_site_entries{$site}++;
      $phase2_site_size{$site} += $length;
    }

    #First choice: take the contig layout from the COMMENT block. The comment
    #is cut at every "*  " marker; chunks after the first one are candidate
    #"<start> <end>: contig of ..." / "... gap of ..." lines.
    my $comment = $data->Comment;

    my @clines = split(/\*  /, $comment);
    my $clines_count = 0;
    my $ctg_count1 = 0;
    foreach my $cline (@clines) {
      $clines_count++;

      #the text before the first marker is never a contig line
      next if $clines_count <= 1;

      my @cline1 = split(/\: contig of/, $cline);
      my $cline2 = $cline1[0];

      next if $cline2 =~ /gap of/;

      #Only accept chunks that really carry a "<start> <end>" pair. Without
      #this check $1/$2 would keep the values of an earlier match and a
      #bogus contig would be written.
      next unless $cline2 =~ /(\d+)\s+(\d+)/;
      my ($ctg_start, $ctg_end) = ($1, $2);

      $ctg_count1++;
      $ctg_count++;

      $ctg_id = $clone_id.".00".$ctg_count1;
      $ctg_id1= $clone_id.".".$version."~".$ctg_count1;

      $ctg_len = $ctg_end - $ctg_start + 1;
      my $seq1 = substr($seq, $ctg_start-1, $ctg_len);

      print OUTDNA "$seq1,$ctg_len,$create_date\n";
      print OUTDNASEQ ">$ctg_id\n$seq1\n";
      print OUTContig "$ctg_id,$count,$ctg_len,$ctg_start,0,$ctg_count,$chr,$ctg_id1\n";

      print OUTContig_land "$ctg_id,$clone_id,$start,$end,1,R$chr\n";

      if ($htg_phase == 2) {

        getGoldenPath($ctg_start, $ctg_len);
      }
    }

    #Second choice, used when the comment had no "*  " marker at all: split
    #the sequence wherever a run of at least 40 'n' bases ends.
    if ($clines_count <= 1) {

      my @bases = split(//, $seq);
      my $base_count = 0;     #bases seen so far in the whole sequence
      my $base_count1= 0;     #bases seen since the last contig boundary
      my $n = 0;              #length of the current/just-ended run of 'n'
      my $ctg_count1 = 0;
      my ($previous, $base);
      my ($offset, $ctg_len);
      foreach $base (@bases) {
        $base_count++;
        $base_count1++;

        if ($base eq "n") {
          $n++;
        }

        if ($n >= 40 && $previous eq "n" && $base ne "n") {

          #first real base after a long gap: emit the contig that ended
          #before the gap
          #NOTE(review): the length/offset arithmetic here and after the
          #loop looks off by one base at contig edges - verify before
          #changing, since already loaded data was produced with it
          $ctg_count1++;
          $ctg_count++;

          $ctg_id = $clone_id.".00".$ctg_count1;
          $ctg_id1= $clone_id.".".$version."~".$ctg_count1;
          $offset = $base_count - $base_count1 + 1;
          $ctg_len= $base_count1-$n;

          my $seq1 = substr($seq, $offset-1, $ctg_len);

          print OUTDNA "$seq1,$ctg_len,$create_date\n";
          print OUTDNASEQ ">$ctg_id\n$seq1\n";
          print OUTContig "$ctg_id,$count,$ctg_len,$offset,0,$ctg_count,$chr,$ctg_id1\n";

          print OUTContig_land "$ctg_id,$clone_id,$start,$end,1,R$chr\n";

          if ($htg_phase == 2) {

            getGoldenPath($offset, $ctg_len);
          }

          $base_count1 = 0;
          $n = 0;

        } elsif ($n < 40 && $previous eq "n" && $base ne "n") {

          #short run of 'n': not a contig boundary, just forget it
          $n = 0;
        }
        $previous = $base;
      }

      #whatever follows the last boundary is the final contig
      $ctg_count1++;
      $ctg_count++;

      $ctg_id = $clone_id.".00".$ctg_count1;
      $ctg_id1= $clone_id.".".$version."~".$ctg_count1;
      $offset = $base_count - $base_count1 + 1;
      $ctg_len= $base_count1 - 1;
      my $seq1 = substr($seq, $offset-1, $ctg_len);

      print OUTDNA "$seq1,$ctg_len,$create_date\n";
      print OUTDNASEQ ">$ctg_id\n$seq1\n";
      print OUTContig "$ctg_id,$count,$ctg_len,$offset,0,$ctg_count,$chr,$ctg_id1\n";

      print OUTContig_land "$ctg_id,$clone_id,$start,$end,1,R$chr\n";

      if ($htg_phase == 2) {

        getGoldenPath($offset, $ctg_len);
      }
    }
  }

  # just print to screen
  print "$count,$clone_id,$gi,$version,$create_date,$start,$end,$htg_phase,$chr,$length,$site\n";

  #One translation, one transcript and one gene are written per CDS feature.
  my $transl_count1 = 0;
  foreach my $cds ($data->Features->Cds) {

    #to get data for Exon and translation table
    my $exon_pos = $cds->Position;
    my $codon_start = $cds->Codon_start;
    my @temp = split(/\//, $codon_start);
    $codon_start = $temp[0];

    my $protein_id = $cds->Protein_id;
    my $note = $cds->Note;
    my $product=$cds->Product;
    $transl_count1++;
    $transl_count++;
    my $transl_id = "GRMP".$transl_count;

    #writes the exon/translation/xref rows and advances $exon_count
    my ($exon_count1, $strand) =
     getExonAndTranslation($exon_pos,$protein_id,$codon_start);

    #to get transcript data into transcript table
    $transc_count++;
    my $gene_id = "GRMG".$transc_count;
    print OUTTransc "GRMT$transc_count,1,$gene_id,$transl_id\n";

    #to get gene information
    getGene($gene_id, $protein_id, $note, $product);

    #exons of this CDS are numbered $previous_exons+1 .. $exon_count
    my $previous_exons = $exon_count - $exon_count1;

    #rank counts up from 1 on the forward strand and down from $exon_count1
    #otherwise
    my $rank;
    if ($strand == 1) {
      $rank = 0;

    } else {
      $rank = $exon_count1+1;

    }

    my $i;
    for ($i = $previous_exons+1; $i <= $exon_count; $i++) {

      if ($strand == 1) {
        $rank++;

      } else {
        $rank--;

      }
      #NOTE(review): the transcript is called "GRMT<n>" in transcript.dat but
      #"Transc<n>" here - confirm which one the loader expects
      print OUTExon_Transc "GRME$i,Transc$transc_count,$rank\n";
    }
  }
}


# getSite($data)
#
# Works out which sequencing site submitted a record by scanning its journal
# lines. Journals are tried in order, and within a journal the rules below are
# tried top to bottom; the first hit wins. When nothing matches, "Others" is
# returned and the journal lines are reported on STDERR.
sub getSite {

  my ($data) = @_;

  # (pattern, site) pairs - the order is significant.
  my @site_rules = (
    [ qr/Japan/,                                  "RGP(Japan)" ],
    [ qr/CHINA|China/,                            "NCGR(China)" ],
    [ qr/Korea/,                                  "KRGRP(Korea)" ],
    [ qr/Taiwan/,                                 "ASPGC(Taiwan)" ],
    [ qr/Thailand/,                               "BIOTEC(Thailand)" ],
    [ qr/FRANCE|France/,                          "Genoscope(France)" ],
    [ qr/MD/,                                     "TIGR(USA)" ],
    [ qr/NY/,                                     "CSHL(USA)" ],
    [ qr/SC/,                                     "CUGI(USA)" ],
    [ qr/NJ/,                                     "PGIR(USA)" ],
    [ qr/WI/,                                     "GCW(USA)" ],
    [ qr/MO/,                                     "GSC(USA)" ],
    [ qr/\bIndian\s+Initiative\b/i,               "IIRGP(India)" ],
    [ qr/\bBotany\b.*University of Georgia\b/i,   "LGBUGA(USA)" ],
  );

  my @journals = $data->Journal;

  my $found;
  JOURNAL:
  for my $text (@journals) {
    for my $rule (@site_rules) {
      my ($pattern, $name) = @$rule;
      if ($text =~ $pattern) {
        $found = $name;
        last JOURNAL;
      }
    }
  }

  return $found if $found;

  print STDERR "Site Unknown for ",$data->Accession," Journals:",
	join("\n",@journals),"\n//";
  return "Others";
}


# getChr($data, $site)
#
# Returns the chromosome of a record. The chromosome qualifier of the source
# feature is used as is unless it is empty/false, equal to "X", or longer than
# two characters. In those cases a per-site default is returned instead, and 0
# for any site without a specific default.
sub getChr {

  my ($data, $site) = @_;

  # Sites with a chromosome default of their own; every other site maps to 0.
  my %default_chr_for = (
    "NCGR(China)"       => 4,
    "KRGRP(Korea)"      => 1,
    "ASPGC(Taiwan)"     => 5,
    "BIOTEC(Thailand)"  => 9,
    "Genoscope(France)" => 12,
  );

  my $annotated = $data->Features->Source->Chromosome;

  my $usable = $annotated && $annotated ne "X" && length($annotated) <= 2;
  return $annotated if $usable;

  return exists $default_chr_for{$site} ? $default_chr_for{$site} : 0;
}


# findHTG_Phase($locus, $keywords)
#
# Derives the HTG phase of a record from its locus line and keyword string.
#   - a "PLN" word in the locus line means phase 3;
#   - an "HTG" word means: the digit following "PHASE" in the keywords, or
#     3 when the keyword string is exactly three characters long, or 0;
#   - anything else yields 0.
# When the locus line holds several of these words, the last one decides.
# Returns the phase as a one-element list.
sub findHTG_Phase {

  my ($locus, $keywords) = @_;

  my $result_phase;
  foreach my $locus_word (split(/ /, $locus)) {

    if ($locus_word eq "PLN") {
      $result_phase = 3;

    } elsif ($locus_word eq "HTG") {
      # Read the digit from the PHASE keyword itself, wherever it sits in the
      # list. Taking the last character of the second keyword gave a wrong
      # phase whenever another keyword (or a line break) came first.
      if ($keywords =~ /PHASE\s*(\d)/) {
	$result_phase = $1;

      } elsif (length($keywords) == 3) {
	$result_phase = 3;

      } else {
	$result_phase = 0;

      }
    }
  }

  if (!$result_phase) {

    $result_phase = 0;
  }

  return ($result_phase);
}


# getGene($gene_id, $protein_id, $note, $product)
#
# Classifies one gene as known or unknown, updates the per-chromosome tallies
# for the current record's chromosome ($chr), and writes one row each to the
# gene, gene description and gene type files.
sub getGene {

  my ($gene_id, $protein_id, $note, $product) = @_;

  # A gene is "unknown" when it is annotated as hypothetical/unknown or when
  # it has neither a note nor a product; otherwise it is "known".
  my $hypothetical_note = $note    && $note    =~ /[hH]ypothetical protein/;
  my $unknown_product   = $product && $product =~ /[uU]nknown protein/;
  my $unannotated       = !($note || $product);

  if ($hypothetical_note || $unknown_product || $unannotated) {
    $unknown_gene{$chr}++;
  }
  else {
    $known_gene{$chr}++;
    $known_genes++;
  }

  # Genes without a protein id are typed as pseudo genes.
  my $genetype = $protein_id ? "SPAN_GB" : "Pseudo";

  # The note takes precedence over the product as description; the
  # description stays undefined when both are missing.
  my $desc = $note || $product || undef;

  print OUTGene "$gene_id,$version,$create_date,$create_date,1003\n";
  print OUTGenedesc "$gene_id,$desc\n";
  print OUTGenetype "$gene_id,$genetype\n";
}

# Top-level statements run in file order, so this call is reached once the
# record loop above has finished and the gene tallies are complete.
getChromosome();

# getChromosome()
#
# Writes one row per chromosome number 0..12 to the chromosome file: the
# number, its "R<n>" name, a three-digit code "1nn", the known and unknown
# gene tallies, and the length taken from the table below.
sub getChromosome {

  my @chr_len = ("0","52739000", "42524000", "45865000", "35179000", "33734000", "36271000", "35106000", "33955000", "27415000", "23348000", "33630000", "30575000");

  for my $chr_no (0 .. 12) {

    # chromosomes without any tallied gene are reported as 0
    my $knownGene   = $known_gene{$chr_no}   || 0;
    my $unknownGene = $unknown_gene{$chr_no} || 0;

    # "100".."109" for 0..9 and "110".."112" for 10..12
    my $chr_code = sprintf("1%02d", $chr_no);

    print OUTChr "$chr_no,R$chr_no,1,$chr_code,$knownGene,$unknownGene,0,$chr_len[$chr_no]\n";
  }
}


# getExonAndTranslation($exon_pos, $protein_id, $codon_start)
#
# Parses the location string of one CDS feature and writes
#   - one exon row per location segment,
#   - one translation row,
#   - one Xref and one objectXref row.
# The file-level counter $exon_count is advanced once per exon; the current
# values of $transl_count, $ctg_count and $create_date are read.
#
#   $exon_pos    - CDS location: "a..b", "join(...)", "complement(...)" or
#                  "complement(join(...))"
#   $protein_id  - protein id of the CDS, may be empty
#   $codon_start - 1, 2 or 3; treated as 1 when missing
#
# Returns (number of exons written, strand), strand being 1 or -1. When
# $exon_pos is empty nothing is written and the strand is undefined.
sub getExonAndTranslation {

    my ($exon_pos,$protein_id,$codon_start) = @_;
    my ($strand, $exon_str);
    my $exon_count1 = 0;
    my $sticky_rank = 1;
    my $transl_id = "GRMP".$transl_count;

    # A CDS without a codon_start qualifier starts at its first base. Without
    # this default the translation start was empty and the phase came out 1.
    $codon_start = 1 unless $codon_start;

    if ($exon_pos) {
      chomp($exon_pos);

      # Strip the operator(s) wrapping the list of segments.
      if (substr($exon_pos, 0, 4) eq "join") {
	$strand = 1;
	$exon_str = substr($exon_pos, 5);
	chop($exon_str);

      } elsif (substr($exon_pos, 0, 15) eq "complement(join") {

	$strand = -1;
	$exon_str = substr($exon_pos, 16);
	chop($exon_str);
	chop($exon_str);

      } elsif (substr($exon_pos, 0, 10) eq "complement") {

	$strand = -1;
	$exon_str = substr($exon_pos, 11);
	chop($exon_str);

      } else {

	$strand = 1;
	$exon_str = $exon_pos;
      }

      my @exons = split(/,/, $exon_str);
      my ($phase, $end_phase);
      my ($transl_seq_start, $transl_seq_end, $start_exon, $end_exon);
      my ($seq_start, $seq_end);
      @exons=reverse @exons if ($strand == -1); #We have to start with the
						#first exon to calculate phase
      foreach my $exon (@exons) {

	  next unless ($exon) ;

	  $exon_count++;
	  $exon_count1++;
	  my $exon_id = "GRME".$exon_count;

	  # "start..end"; the separator is two literal dots
	  ($seq_start, $seq_end) = split(/\.\./, $exon);

	  # drop the partial-feature marker on the start
	  if (substr($seq_start, 0, 1) eq "<") {
	    $seq_start = substr($seq_start, 1);
	  }

	  # a segment written as a single position is one base long
	  if (!defined($seq_end) || $seq_end eq "") {
	    $seq_end = $seq_start;
	  }

	  # drop the partial-feature marker on the end
	  if (substr($seq_end, 0, 1) eq ">") {
	    $seq_end = substr($seq_end, 1);
	  }

	  if ($exon_count1 == 1) {
	      # first exon in transcription order: translation starts here
	      $start_exon = "GRME".$exon_count;
	      $transl_seq_start = $codon_start;
	      $phase = (1-$codon_start)%3;
	  } else {
	      # every later exon starts in the phase the previous one ended in
	      $phase = $end_phase;
	  }
	  $end_phase = ($phase+$seq_end-$seq_start+1)%3;

	  print OUTExon "$exon_id,$ctg_count,1,$create_date,$create_date,$seq_start,$seq_end,$strand,$phase,$end_phase,$sticky_rank\n";
      }


      # $seq_start/$seq_end still hold the coordinates of the last exon
      $end_exon = "GRME".$exon_count;
      $transl_seq_end = $seq_end-$seq_start + 1 -$end_phase; #get rid of partial codons
				# (  $phase+$transl_seq_end  mod 3 must be 0 )

      print OUTTransl "$transl_id,1,$transl_seq_start,$start_exon,$transl_seq_end,$end_exon\n";

      # Without a protein id the translation id stands in for it, under a
      # different external database id.
      my $externalDBId;
      if (!$protein_id) {
	$protein_id = $transl_id;
	$externalDBId = 1999;

      } else {
	$externalDBId = 1000;
      }

      my @protein_ids = split(/\./, $protein_id);
      print OUTXref "$externalDBId,$protein_id,$protein_ids[0],1,N\n";
      print OUTobjectXref "$transl_id,Translation,$transl_count\n";

    }
    return ($exon_count1, $strand);
}


# getRepeat($data)
#
# Writes one repeat_feature row per repeat_region feature of the record that
# has a location, attached to the current contig ($ctg_count). The file-level
# counter $repeat_count is advanced for every repeat_region feature, with or
# without a location.
sub getRepeat {

  my ($data) = @_;
  my $contig = $ctg_count;

  foreach my $region ($data->Features->Repeat_region) {
    $repeat_count++;

    my $repeat = $region->Position;
    next unless $repeat;
    chomp($repeat);

    my ($strand, $repeat_range);
    if (substr($repeat, 0, 10) eq "complement") {

      # "complement(a..b)": drop the wrapper and the closing parenthesis
      $strand = -1;
      $repeat_range = substr($repeat, 11);
      chop($repeat_range);

    } else {

      $strand = 1;
      $repeat_range = $repeat;
    }

    # "start..end"; the separator is two literal dots
    my ($seq_start, $seq_end) = split(/\.\./, $repeat_range);

    # Remove the partial-feature markers, as the exon parser does, so that
    # only plain numbers reach the data file.
    if (substr($seq_start, 0, 1) eq "<") {
      $seq_start = substr($seq_start, 1);
    }
    if (defined($seq_end) && substr($seq_end, 0, 1) eq ">") {
      $seq_end = substr($seq_end, 1);
    }

    print OUTRepeat "$contig,$seq_start,$seq_end,0.0000,$strand,1003,0,0,unknown\n";
  }
}


# getGoldenPath($offset, $ctg_len)
#
# Writes one static_golden_path row for the current contig. The contig spans
# $offset .. $offset+$ctg_len-1, and the same span is used for the chromosome
# and the fpc contig coordinates. The accession ($clone_id), chromosome ($chr)
# and contig number ($ctg_count) of the record being processed are read from
# file scope.
sub getGoldenPath {

  my ($offset, $ctg_len) = @_;

  my $first = $offset;
  my $last  = $offset + $ctg_len - 1;

  # NOTE(review): raw_end is the absolute end position, not the contig
  # length, although raw_start is fixed at 1 - confirm this is intended.
  my @fields = (
    $clone_id,                  # fpcctg_name
    "R".$chr."_".$clone_id,     # chr_name
    $ctg_count,                 # raw_id
    $first,                     # chr_start
    $last,                      # chr_end
    $first,                     # fpcctg_start
    $last,                      # fpcctg_end
    1,                          # raw_start
    $last,                      # raw_end
    1,                          # raw_ori
    "CUGI",                     # type
  );

  my $record = join(",", @fields);

  print OUTGoldenPath "$record\n";
}


# getLastTimeResults()
#
# Restores the totals of the previous run from summary.previous so that this
# run continues counting from them. Every line is matched against the rules
# below, top to bottom; the first matching rule stores the last word of the
# line, either in a per-site hash (keyed by the first word of the line) or in
# a scalar counter. Lines matching no rule are ignored.
# Dies when summary.previous cannot be opened.
sub getLastTimeResults {

  # (pattern, destination) pairs - the order is significant. A hash reference
  # receives $hash{first word} = value, a scalar reference receives the value.
  my @rules = (
    [ qr/phase1 BACs:/,     \%phase1_site_entries ],
    [ qr/phase1 BAC_size:/, \%phase1_site_size ],
    [ qr/phase2 BACs:/,     \%phase2_site_entries ],
    [ qr/phase2 BAC_size:/, \%phase2_site_size ],
    [ qr/phase3 BACs:/,     \%phase3_site_entries ],
    [ qr/phase3 BAC_size:/, \%phase3_site_size ],
    [ qr/total BACs:/,      \%site_entries ],
    [ qr/total BAC_size:/,  \%site_size ],
    [ qr/Phase1_count:/,    \$phase1_count ],
    [ qr/Phase1_size:/,     \$phase1_size ],
    [ qr/Phase2_count:/,    \$phase2_count ],
    [ qr/Phase2_size:/,     \$phase2_size ],
    [ qr/Phase3_count:/,    \$phase3_count ],
    [ qr/Phase3_size:/,     \$phase3_size ],
    [ qr/Total BACs:/,      \$count ],
    [ qr/Predicted_genes:/, \$transl_count ],
    [ qr/Known_genes:/,     \$known_genes ],
    [ qr/Unknown_genes:/,   \$unknown_genes ],
    [ qr/Contigs:/,         \$ctg_count ],
    [ qr/Exons:/,           \$exon_count ],
    [ qr/Repeats:/,         \$repeat_count ],
    [ qr/Transcripts:/,     \$transc_count ],
  );

  open(INPUTSummary, "summary.previous") || die "cannot open summary.previous:$!";

  while (my $line = <INPUTSummary>) {
    # Without the chomp every restored value carried a trailing newline,
    # which showed up as blank lines in the next summary.
    chomp($line);

    my @wds = split(/ /, $line);
    next unless @wds;
    my ($key, $value) = @wds[0, -1];

    for my $rule (@rules) {
      my ($pattern, $target) = @$rule;
      next unless $line =~ $pattern;

      if (ref($target) eq 'HASH') {
        $target->{$key} = $value;
      } else {
        $$target = $value;
      }
      last;
    }
  }

  close(INPUTSummary);
}


# Reached after the record loop and getChromosome(), because top-level
# statements run in file order.
summarize_results();

# summarize_results()
#
# Writes the run summary: first the per-site tallies ("<site> <label>:
# <value>"), then the overall totals, and finally the update date. The lines
# use the labels that getLastTimeResults() matches when it reads the file back.
sub summarize_results {

  # Per-site sections, in output order.
  my @site_sections = (
    [ "phase1 BACs",     \%phase1_site_entries ],
    [ "phase1 BAC_size", \%phase1_site_size ],
    [ "phase2 BACs",     \%phase2_site_entries ],
    [ "phase2 BAC_size", \%phase2_site_size ],
    [ "phase3 BACs",     \%phase3_site_entries ],
    [ "phase3 BAC_size", \%phase3_site_size ],
    [ "total BACs",      \%site_entries ],
    [ "total BAC_size",  \%site_size ],
  );

  for my $section (@site_sections) {
    my ($label, $tally) = @$section;
    for my $site_name (keys %$tally) {
      my $amount = $tally->{$site_name};
      print OUTSummary "$site_name ${label}: $amount\n";
    }
  }

  # Overall clone totals; the total size is the sum of the three phase sizes.
  my $bac_size = $phase1_size + $phase2_size + $phase3_size;
  my @clone_totals = (
    [ "Phase1_count",   $phase1_count ],
    [ "Phase1_size",    $phase1_size ],
    [ "Phase2_count",   $phase2_count ],
    [ "Phase2_size",    $phase2_size ],
    [ "Phase3_count",   $phase3_count ],
    [ "Phase3_size",    $phase3_size ],
    [ "Total BACs",     $count ],
    [ "Total BAC_size", $bac_size ],
  );
  for my $total (@clone_totals) {
    print OUTSummary "$total->[0]: $total->[1]\n";
  }

  # These counters may still carry the newline they were read with.
  chomp($transl_count, $known_genes, $ctg_count, $exon_count,
        $repeat_count, $transc_count);

  # Unknown genes are whatever is left after the known ones.
  $unknown_genes = $transl_count - $known_genes;

  my @feature_totals = (
    [ "Predicted_genes", $transl_count ],
    [ "Known_genes",     $known_genes ],
    [ "Unknown_genes",   $unknown_genes ],
    [ "Contigs",         $ctg_count ],
    [ "Exons",           $exon_count ],
    [ "Repeats",         $repeat_count ],
    [ "Transcripts",     $transc_count ],
  );
  for my $total (@feature_totals) {
    print OUTSummary "$total->[0]: $total->[1]\n";
  }

  print OUTSummary "Update: $today\n";
}

# Close every output file before anything is copied or loaded. This includes
# the summary, FASTA and chromosome files, which used to stay open while
# summary.dat was being copied. A failing close is reported but does not
# abort the run.
for my $output (
    [ \*OUTClone,       "clone.dat" ],
    [ \*OUTContig,      "contig.dat" ],
    [ \*OUTExon,        "exon.dat" ],
    [ \*OUTGene,        "gene.dat" ],
    [ \*OUTGenedesc,    "gene_description.dat" ],
    [ \*OUTGenetype,    "genetype.dat" ],
    [ \*OUTTransl,      "translation.dat" ],
    [ \*OUTTransc,      "transcript.dat" ],
    [ \*OUTExon_Transc, "exon_transcript.dat" ],
    [ \*OUTDNA,         "dna.dat" ],
    [ \*OUTDNASEQ,      "dnaseq.fasta" ],
    [ \*OUTContig_land, "contig_land.dat" ],
    [ \*OUTChr,         "chromosome.dat" ],
    [ \*OUTGoldenPath,  "static_golden_path.dat" ],
    [ \*OUTXref,        "Xref.dat" ],
    [ \*OUTobjectXref,  "objectXref.dat" ],
    [ \*OUTRepeat,      "repeat_feature.dat" ],
    [ \*OUTSummary,     "summary.dat" ],
) {
    my ($fh, $file) = @$output;
    close($fh) or warn "cannot close $file: $!\n";
}

#system("perl insertDNA.pl");

# Keep a backup of the previous summary, then make this run's summary the
# starting point of the next run. The list form of system avoids the shell;
# failures are reported because a stale summary.previous would make the next
# run continue from the wrong totals.
system("cp", "summary.previous", "summary.previous.back") == 0
    or warn "cannot copy summary.previous to summary.previous.back (status $?)\n";
system("cp", "summary.dat", "summary.previous") == 0
    or warn "cannot copy summary.dat to summary.previous (status $?)\n";

# Loading into the database is currently disabled; an incremental run
# ($ini_date set) and a first run would use different loaders.
if ($ini_date) {
  #system("load_data");

} else {
  #system("load_data0");
}


