# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Analysis::Config::GeneBuild::GeneBuilder - imports global variables used by EnsEMBL gene building

=head1 SYNOPSIS
    use Bio::EnsEMBL::Analysis::Config::GeneBuild::GeneBuilder;
    use Bio::EnsEMBL::Analysis::Config::GeneBuild::GeneBuilder qw(  );

=head1 DESCRIPTION

GeneBuilder is a pure ripoff of humConf written by James Gilbert.

humConf is based upon ideas from the standard perl Env environment
module.

It imports and sets a number of standard global variables into the
calling package, which are used in many scripts in the human sequence
analysis system.  The variables are first declared using "use vars",
so that they can be used when "use strict" is in use in the calling
script.  Without arguments all the standard variables are set, and
with a list, only those variables whose names are provided are set.
The module will die if a variable which doesn't appear in its
C<%GeneBuilder> hash is asked to be set.

The variables can also be references to arrays or hashes.

Edit C<%GeneBuilder> to add or alter variables.

All the variables are in capitals, so that they resemble environment
variables.

=head1 CONTACT

=cut


package Bio::EnsEMBL::Analysis::Config::GeneBuild::GeneBuilder;

use strict;
use vars qw(%GeneBuilder);

# Gene build configuration table.  Each key can be requested by name when
# this module is use'd; import() below then makes it available as a scalar
# of the same name in the calling package.
%GeneBuilder = (

    #
    # Gene types
    #

    GB_FINAL_GENETYPE => 'all-gene5-pred',

    # Other gene-types which have to be combined / fetched (in GB_COMB_DBNAME).
    # Types that are known but currently switched off (add to the list to
    # enable):
    #   protein_coding protein other-protein-gene cdna smrna clmrna
    #   arest-gene1 arest-gene2 viest-gene goest-gene brest-gene glest-gene
    #   dimrna-gene whest-gene1 whest-gene2 baest-gene maest-gene1
    #   maest-gene2 moest-gene1 moest-gene2 momrna-gene omrna-gene-high
    #   omrna-gene-low oest-gene oest ocdna-gene evidence-gene
    GB_MISC_OTHER_INPUT_GENETYPES => [
        qw(
            protein-gene
            cdna-gene
            mrna-gene
            est-gene
            omrna-gene1
        )
    ],

    # Other gene-types which have to be combined / fetched (in GB_COMB_DBNAME).
    # Types that are known but currently switched off (add to the list to
    # enable):
    #   fgenesh_new twinscan exonhunter genewise predict
    GB_PREDICTION_INPUT_GENETYPES => [
        qw(
            fgenesh
            protein_coding
            protein_coding_unsupported
        )
    ],

    #
    # Parameters for use of genscan predictions in final build
    #

    GB_USE_ABINITIO              => '0',
    GB_ABINITIO_TYPE             => 'predict',
    GB_ABINITIO_SUPPORTED_TYPE   => 'ab_initio_supported',
    GB_ABINITIO_PROTEIN_EVIDENCE => [ qw(Swall) ],
    GB_ABINITIO_DNA_EVIDENCE     => [ qw(Vertrna Unigene) ],

    # This must be a single logic_name.  It only has to be defined if you
    # have more than one type in the prediction transcript table, as this
    # breaks the PredictionGenebuilder if multiple sets are passed in.
    GB_ABINITIO_LOGIC_NAME => 'Genscan',

    GB_MIN_GENSCAN_EXONS  => 4,
    GB_GENSCAN_MAX_INTRON => 15_000,

    # If you want to confirm prediction transcripts with pfam.
    # This option is currently only used for anopheles.
    # Use with caution as the result will be drastically different
    # than what you get with the default option.
    GB_CONFIRM_PFAM => '0',

    # Lower bound in the 'base align features' retrieved in the genebuilder.
    GB_MIN_FEATURE_SCORE  => 50,
    GB_MIN_FEATURE_LENGTH => 15,

    # Are we running on slices or RawContigs?  This may be obsolete.
    GB_VCONTIG => 1,

    # Maximum number of transcripts per gene.
    GB_MAX_TRANSCRIPTS_PER_GENE => 15,

    #
    # Other parameters of the GeneBuild, also used in the post genebuild
    # checks
    #

    # Introns smaller than this could be real due to frameshifts.
    GB_MINSHORTINTRONLEN => 7,

    # Introns smaller than this are considered too short.
    # NOTE(review): the original wording was "introns between smaller than
    # this", which may have meant "between GB_MINSHORTINTRONLEN and this
    # value" -- confirm.
    GB_MAXSHORTINTRONLEN => 10,

    #
    # The rest of these don't seem to be used any more
    #

    # Introns longer than this are too long.
    GB_MINLONGINTRONLEN => 20_000,

    # Exons smaller than this could be real due to frameshifts.
    GB_MINSHORTEXONLEN => 3,

    # Exons shorter than this are too short.
    GB_MAXSHORTEXONLEN => 10,

    # Exons longer than this are probably too long.
    GB_MINLONGEXONLEN => 5_000,

    GB_MINTRANSLATIONLEN => 10,

    GB_MAX_EXONSTRANSCRIPT => 150,

    GB_MAXTRANSCRIPTS => 10,
    GB_MAXGENELEN     => 50_000,

    GB_IGNOREWARNINGS => 1,

    # Keep all in a cluster above this identity -- Chengzhi
    GB_HIGH_PERCENT_ID => 99.9,

    # Keep at most one in a cluster if all below this identity.
    GB_LOW_PERCENT_ID => 99.5,

    # Remove all below this identity.
    # NOTE(review): this comment sat directly above GB_PREFILTER in the
    # original; it may actually describe GB_MIN_PERCENT_ID -- confirm.
    GB_PREFILTER          => 0,
    GB_MIN_COVERAGE       => 30,
    GB_MIN_PERCENT_ID     => 30,
    GB_MIN_SINGLE_EXON_ID => 30,

    # Check intron bad ratio below this percent_id;
    # small introns (<25 for GTAG, <35 for others) not allowed.
    GB_MAX_INTRON_CHECK_ID => 99.9,

    # Allowed non-canonical introns ratio in a transcript.
    GB_MAX_BAD_INTRON_RATIO => 0.50,

    # Keep introns (remove or cut below) above this percent_id.
    GB_MIN_INTRON_KEEP_ID => 99.9,
    GB_MIN_INTRON_SIZE    => 30,

    # Minimum translation length.
    GB_MIN_TRANSLATION => 50,

    # Minimum CDS to transcript length ratio.
    GB_MIN_CDS_RATIO  => 0.1,
    GB_MAX_UTR_LENGTH => 3_000,
    GB_MAX_UTR_EXONS  => 4,    # exons are complete UTRs

    # Exon/intron boundary shift and end exon extension.
    GB_MAX_BOUNDARY_SHIFT    => 25,
    GB_MAX_END_EXT           => 50,
    GB_POST_KEEP_ID          => 95,
    GB_POST_BAD_INTRON_RATIO => 0.20,

    # To make up for the end using the longer alternative trans.
    GB_POST_EXTENSION => 1,

    # To use predicted gene to connect partial pairs.
    GB_USE_PREDICTED    => 1,
    GB_PREDICT_TYPE     => 'predict',
    GB_USE_COMPLETE_CDS => 0,

    # Max distance between two genes to connect them;
    # this is for only non-cDNA supported genes.
    GB_HIGH_DISTANCE => 2_500,
    GB_LOW_DISTANCE  => 2_500,
    GB_MAX_DISTANCE  => 2_500,

    GB_USE_BIOTYPE => 1,    # should always use
);

# import
#
# Invoked implicitly by "use <this module> LIST".  Makes entries of
# %GeneBuilder visible in the calling package as package scalars of the
# same name (e.g. $GB_VCONTIG).
#
#   Arguments : the class name (supplied by perl, discarded), followed by an
#               optional list of %GeneBuilder key names.  With no names, every
#               key of %GeneBuilder is imported.
#   Returns   : nothing useful.
#   Dies      : "Error: GeneBuilder: NAME not known\n" if a requested name is
#               not a key of %GeneBuilder; re-throws any error raised while
#               predeclaring the variables in the caller.
#
# Each imported scalar is an alias of the hash slot itself (a glob
# assignment of a reference to $GeneBuilder{NAME}), not a copy.
sub import {
  my ($callpack) = caller(0); # Name of the calling package
  shift;                      # Discard the class name perl passes first

  # Get list of variables supplied, or else
  # all of GeneBuilder:
  my @vars = @_ ? @_ : keys( %GeneBuilder );
  return unless @vars;

  # Validate every requested name BEFORE it is interpolated into the string
  # eval below, so an unknown or malformed name always gives the same clear
  # error and never reaches eval.  "exists" (not "defined") is used so that a
  # key which is present but deliberately set to undef is still importable.
  foreach my $name (@vars) {
    die "Error: GeneBuilder: $name not known\n"
      unless exists $GeneBuilder{ $name };
  }

  # Predeclare global variables in calling package, so the caller can use
  # them under "use strict"
  eval "package $callpack; use vars qw("
    . join(' ', map { '$' . $_ } @vars) . ")";
  die $@ if $@;

  foreach my $name (@vars) {
    no strict 'refs';
    # Exporter does a similar job to the following
    # statement, but for function names, not
    # scalar variables:
    *{"${callpack}::$name"} = \$GeneBuilder{ $name };
  }

  return;
}

1;
