#!/usr/local/bin/perl

package AVIndexEnsembl2;

# Extend the module search path at compile time so that the `use` statements
# below can be resolved. All paths are relative, so they are interpreted
# against the current working directory of the process. Because each entry is
# unshifted, the directory added last ("../bioperl-live") is searched first.
BEGIN {
    unshift(@INC,"../ensembl/modules");
    unshift(@INC,"../ensembl-view");
    unshift(@INC,"../ensembl-map/modules");
    unshift(@INC,"../ensembl-trace/modules");
    unshift(@INC,"../ensembl-external/modules");
    unshift(@INC,"../conf");
    unshift(@INC,"../modules");
    unshift(@INC,"../perl");
    unshift(@INC,"../bioperl-live");
}

use strict;
use SiteDefs;
use EnsWeb;
use Bio::EnsEMBL::DBLoader;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBArchive::Obj;
use Bio::EnsEMBL::Map::DBSQL::Map;
use Bio::EnsEMBL::ExternalData::SNPSQL::WebSNPAdaptor;
use Bio::EnsEMBL::ExternalData::TCORESQL::DBAdaptor;
use Bio::EnsEMBL::ExternalData::Family::FamilyAdaptor;
use Bio::EnsemblViewer::Sequence::Contig;
use Bio::EnsEMBL::DBSQL::ExternalWrapper;
use XML::Generator;
use AVIndexUtils;
use DiseaseHandler;

# Unbuffer STDOUT so the progress dots printed by the indexers appear at once.
$| = 1;
# Package globals shared with the Index* subroutines further down this file.
use vars qw($LIMIT $DEBUG $TIERS $IDX $MODE $GENEURL $TRANSURL $CLONEURL $CONTIGURL $PEPURL);
# Directory under which one "<Type>.idx" index directory is built per type.
my $INDEXROOT = '/mysql/ensembl/www/server/modules/AltaVista/ens100_index';
# Upper bound the Index* subroutines compare their document counter against.
$LIMIT = 1000000000;
#$LIMIT = 100;
# Passed to ens_avs_buildmode_ex() by the main loop.
$TIERS = 12;
# Open mode passed to ens_avs_open() by the main loop.
$MODE = 'rw';
# URL prefixes; the record identifier is appended by the indexers.
$GENEURL = qq(http://www.ensembl.org/perl/geneview?gene=);
$PEPURL = qq(http://www.ensembl.org/perl/protview?peptide=);
$TRANSURL = qq(http://www.ensembl.org/perl/transview?transcript=);
$CLONEURL = qq(http://www.ensembl.org/perl/cloneview?clone=);
$CONTIGURL = qq(http://www.ensembl.org/perl/contigview?contigid=);

# Handle on the core database; handed to every Index* subroutine as $dbh.
my $db = undef;
# NOTE(review): the five settings below are not referenced anywhere else in
# the visible file; the connection is built from get_locator() instead.
my $dbuser = 'root';
my $dbname = 'ensembl100';
my $host = 'localhost';
my $port = '3307';
my $dbpass = undef;
# Connect to the core database and, as a start-up check, to the SNP database.
# get_locator() and the $ENSEMBL_* variables are not defined in this file;
# presumably they are imported from EnsWeb / SiteDefs -- TODO confirm.
eval {
    my $locator = get_locator();
    $db =  Bio::EnsEMBL::DBLoader->new($locator);
    # $snpdb goes out of scope at the end of the eval; it is not used again.
    my $snpdb = Bio::EnsEMBL::ExternalData::SNPSQL::WebSNPAdaptor->new(   
                                                 -dbname => $ENSEMBL_SNP,
                                                 -user   => $ENSEMBL_DBUSER,
                                                 -host   => $ENSEMBL_HOST,
                                                 -port   => $ENSEMBL_HOST_PORT,
                                                 );
};
# Any exception from either connection attempt ends the run here.
# NOTE(review): a bare `exit` terminates with status 0, so a failed start-up
# is indistinguishable from success for a calling shell script.
if( $@ ) {
    print STDERR "Apologies. A SNP database connection exception occured: $@\n";
    exit;
}


# Number of documents reported by the most recent Index<Type> call.
my $docs;

# Index types that have an Index<Type> subroutine in this file:
#   Contig Domain Feature Family Disease IPI External Genscan
#   Chromosome Clone Peptide Marker SNP Gene IDArchive
# List the types to (re)build inside the qw() below.
#
# A named loop variable is used on purpose: iterating with the implicit $_
# aliases it to a read-only list constant, and indexers that read a file with
# `while (<FH>)` would then die with "Modification of a read-only value".
foreach my $type (qw(
    Domain
        )) {
    my $index = "$INDEXROOT/$type.idx";

    # Resolve the indexer before touching the disk so that a misspelt type
    # cannot wipe an existing index and then fail.
    my $indexer = __PACKAGE__->can("Index$type");
    unless ($indexer) {
        warn "AV indexer: no Index$type subroutine is defined; skipping $type.\n";
        next;
    }

    # Always start from an empty index directory. The list form of system()
    # bypasses the shell, so the path is never re-interpreted.
    if (-d $index) {
        print STDERR "AV indexer: clearing old index directory $index...\n";
        system('rm', '-Rf', $index) == 0
            or warn "AV indexer: could not remove $index (status $?)\n";
    }
    else {
        print STDERR "AV indexer: making index directory $index...\n";
    }
    system('mkdir', $index) == 0
        or warn "AV indexer: could not create $index (status $?)\n";

    print STDERR "AV indexer: creating index...\n";
    unless ($IDX = ens_avs_open($index, $MODE)) {
        warn "Fatal: cannot open index $index for read/write.\n";
        next;
    }
    print STDERR "AV indexer: switching to buildmode...\n";
    ens_avs_buildmode_ex($IDX, $TIERS);

    print STDERR "AV indexer: indexing $type...\n";
    $docs = $indexer->($IDX, $db);
    $docs = 0 unless defined $docs;
    print STDERR "\nIndexed $docs for $type\n";

    # Flush, finalise and release the index before changing its ownership.
    ens_avs_commit($IDX);
    ens_avs_finalize($IDX);
    ens_avs_close($IDX);
    print STDERR "AV indexer: setting index permissions...\n";
    system('chown', '-R', 'w3adm', $index) == 0
        or warn "AV indexer: could not chown $index (status $?)\n";
}


1;
# SUBROUTINES #

#####################################################
# IndexFeature($idx, $dbh)
#
# Indexes similarity features that lie on contigs present in
# static_golden_path. Features shorter than 100 bp or below 75 % identity are
# skipped. One document is written per remaining feature and the index is
# committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
sub IndexFeature {

    my ($idx, $dbh) = @_;

    # Analysis ids to index. The original code labelled them:
    #   8 blastp_14, 9 wublastp, 10 rhprimers, 11 unigene,
    #   12 embl_vertrna, 51214 embl_vertrna2
    my @analysis_ids = (8, 9, 10, 11, 12, 51214);

    # Prepared once and re-executed per analysis id via a placeholder.
    my $sth = $dbh->prepare(
        "select
            feature.id,
            feature.contig,
            feature.hid,
            feature.seq_start,
            feature.seq_end,
            feature.name,
            feature.perc_id,
            contig.id
        from feature, contig, static_golden_path
        where feature.analysis = ?
        and feature.contig = contig.internal_id
        and contig.internal_id = static_golden_path.raw_id
        ");

    my $i = 0;

    foreach my $match (@analysis_ids) {
        $sth->execute($match);
        while (my ($fid, $contigid, $hid, $start, $end, $type, $percent, $contigname)
                   = $sth->fetchrow_array()) {
            my $length = $end - $start;
            next if ($length < 100 || $percent < 75);

            my $desc = qq(Ensembl feature $hid has a ${length}bp $type similarity match to contig $contigname at bps $start-$end);
            $desc .= qq( [$percent % identity]) if ($percent);
            my $FEATUREURL = qq(http://www.ensembl.org/perl/contigview?contig=$contigname&fpos_start=$start&fpos_end=$end&fpos_context=10000);

            my $xml = make_XML("Sequence feature", $hid, $contigname, $start, $end, $type, $hid, $FEATUREURL, \$desc);
            # The running counter keeps document ids unique per hit id.
            ens_avs_indexdoc($idx, "$hid-$i", $hid, $xml, "feature");
            print ".";
            $i++;
            if ($i % 1000 == 0) {
                if (ens_avs_commit($idx)) {
                    print "[Index commit OK:$i]";
                }
            }
            # Enforce the cap per document, not only between analyses.
            if ($i >= $LIMIT) {
                $sth->finish();
                return($i);
            }
        }
    }
    return($i);
}

#####################################################
# IndexGene($idx, $dbh)
#
# Writes one document per row of the gene table (listing the gene's
# transcript ids), then one document per external cross-reference joined to a
# transcript's translation. The index is committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the total number of documents indexed. The gene pass stops at
# $LIMIT documents and the cross-reference pass at $LIMIT * 2 in total.
sub IndexGene {

    my ($idx, $dbh) = @_;
    my $i = 0;

    my $gene_sth  = $dbh->prepare("SELECT id FROM gene");
    # Prepared once; the gene id is bound per execution instead of being
    # interpolated into the SQL text.
    my $trans_sth = $dbh->prepare("select id from transcript where gene = ?");

    $gene_sth->execute();
    while (my ($g) = $gene_sth->fetchrow_array()) {
        $trans_sth->execute($g);
        my @transcripts = ();
        while (my ($t) = $trans_sth->fetchrow_array()) {
            push(@transcripts, $t);
        }
        my $desc = qq(Ensembl gene $g has ) . scalar(@transcripts) . " transcript(s):" . join(" ", @transcripts);
        $i++;

        my $xml = make_XML("Gene", $g, "-", "-", "-", "-", $g, $GENEURL.$g, \$desc);
        ens_avs_indexdoc($idx, "Gene $g-$i", "$g " . join(" ", @transcripts), $xml, "gene");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $gene_sth->finish();
            last;
        }
    }

    # External identifiers attached to translations, mapped back to the gene.
    my $xref_sth = $dbh->prepare("SELECT x.display_id,x.dbprimary_id,xdb.db_name,tr.gene FROM transcript tr,objectXref ox,Xref x,externalDB xdb where tr.translation = ox.ensembl_id and ox.XrefId = x.XrefId and xdb.externalDBId = x.externalDBId");
    $xref_sth->execute();
    while (my ($extid, $extpid, $extdb, $g) = $xref_sth->fetchrow_array()) {
        my $desc = qq($extid is an external identifier from the $extdb database and is mapped to Ensembl gene $g);
        $i++;

        my $xml = make_XML("External Gene ID", $extid, "-", "-", "-", "-", $extid, $GENEURL.$g, \$desc);
        ens_avs_indexdoc($idx, "ExtID $extid-$i", "$extid $extpid", $xml, "gene");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT * 2) {
            $xref_sth->finish();
            last;
        }
    }
    return($i);
}

#####################################################
# IndexPeptide($idx, $dbh)
#
# Writes one document per (transcript, translation) pair, walking the gene
# table and looking up each gene's transcripts. The index is committed every
# 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the number of documents indexed. $LIMIT is checked after each gene,
# so all peptides of the gene that crosses the limit are still indexed.
sub IndexPeptide {

    my ($idx, $dbh) = @_;
    my $i = 0;

    my $gene_sth  = $dbh->prepare("SELECT id FROM gene");
    # Prepared once; the gene id is bound per execution instead of being
    # interpolated into the SQL text.
    my $trans_sth = $dbh->prepare("select id, translation from transcript where gene = ?");

    $gene_sth->execute();
    while (my ($g) = $gene_sth->fetchrow_array()) {
        $trans_sth->execute($g);
        while (my ($trans, $pep) = $trans_sth->fetchrow_array()) {
            my $desc = qq(Ensembl peptide $pep is a product of Ensembl gene $g [transcript $trans]);
            $i++;

            my $xml = make_XML("Peptide", $pep, "-", "-", "-", "-", $pep, $PEPURL.$pep, \$desc);
            ens_avs_indexdoc($idx, "Peptide $pep-$i", $pep, $xml, "peptide");
            print ".";
            if ($i % 1000 == 0) {
                if (ens_avs_commit($idx)) {
                    print "[Index commit OK:$i]";
                }
            }
        }
        if ($i >= $LIMIT) {
            $gene_sth->finish();
            last;
        }
    }

    return($i);
}

#####################################################
# IndexTranscript($idx, $dbh)
#
# Writes one document per row of the transcript table, then one document per
# row of transcriptdblink (external identifiers mapped to a transcript). Each
# description is also echoed to STDOUT. The index is committed every 1000
# documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the total number of documents indexed over both passes. A single
# counter runs across both passes (as in IndexGene): the transcript pass
# stops at $LIMIT documents and the external-id pass at $LIMIT * 2 in total.
sub IndexTranscript {

    my ($idx, $dbh) = @_;
    my $i = 0;

    my $trans_sth = $dbh->prepare("SELECT id, gene FROM transcript");
    $trans_sth->execute();
    while (my ($t, $g) = $trans_sth->fetchrow_array()) {
        my $desc = qq(Transcript $t is a transcription product of Ensembl gene $g);
        print "$desc\n";
        $i++;
        my $xml = make_XML("Transcript", $t, "-", "-", "-", "-", $t, $TRANSURL.$t, \$desc);
        ens_avs_indexdoc($idx, "Transcript $t-$i", $t, $xml, "transcript");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $trans_sth->finish();
            last;
        }
    }

    my $link_sth = $dbh->prepare("SELECT external_id, external_db, transcript_id FROM transcriptdblink");
    $link_sth->execute();
    while (my ($extid, $extdb, $tid) = $link_sth->fetchrow_array()) {
        my $desc = qq($extid is an external identifier from the $extdb database and is mapped to Ensembl transcript $tid);
        print "$desc\n";
        $i++;
        my $xml = make_XML("External Transcript ID", $extid, "-", "-", "-", "-", $extid, $TRANSURL.$tid, \$desc);
        ens_avs_indexdoc($idx, "ExtID $extid-$i", $extid, $xml, "transcript");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT * 2) {
            $link_sth->finish();
            last;
        }
    }
    return($i);
}

#####################################################
# IndexGenscan($idx, $dbh)
#
# Probes fset numbers 1..N, where N is the row count of fset_feature, and
# writes one document for every fset number that yields a row from the
# fset_feature/feature/contig join. Fset numbers with no row are skipped
# instead of being indexed as empty documents. The index is committed every
# 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the number of documents actually indexed; stops at $LIMIT.
#
# NOTE(review): N counts fset_feature rows, not distinct fsets, so the probe
# range is presumably wider than the set of existing fset ids -- confirm.
sub IndexGenscan {

    my ($idx, $dbh) = @_;

    my $count_sth = $dbh->prepare("select count(*) as total from fset_feature");
    $count_sth->execute();
    my ($k) = $count_sth->fetchrow_array();
    $count_sth->finish();
    $k = 0 unless defined $k;
    print STDERR "Indexing $k genscan peptides....\n";

    # Prepared once; the fset number is bound per execution.
    my $sth = $dbh->prepare("
        select
            fset_feature.fset as genscan_id,
            contig.id as contig,
            min(feature.seq_start) as start,
            max(feature.seq_end) as end
        from fset_feature,feature,contig
        where
            fset_feature.feature = feature.id
        and
            feature.contig = contig.internal_id
        and
            fset_feature.fset = ?
        group by
            fset_feature.fset
    ");

    my $indexed = 0;
    foreach my $fset (1 .. $k) {
        $sth->execute($fset);
        my ($genscan_id, $contig, $start, $end) = $sth->fetchrow_array();
        $sth->finish();
        next unless defined $genscan_id;    # no such fset: nothing to index

        $end = $start + 1; # hack! (kept from the original: max(seq_end) is discarded)
        my $desc = qq($genscan_id is an Ensembl Genscan peptide prediction located on contig $contig at bps $start-$end);
        my $GSCANURL = qq(http://www.ensembl.org/perl/contigview?contig=$contig&fpos_start=$start&fpos_end=$end&fpos_context=10000);

        my $xml = make_XML("Genscan peptide prediction", $genscan_id, $contig, $start, $end, "-", $genscan_id, $GSCANURL, \$desc);
        ens_avs_indexdoc($idx, "$genscan_id-$fset", $genscan_id, $xml, "genscan");
        print ".";
        $indexed++;
        if ($indexed % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$indexed]";
            }
        }
        return($indexed) if $indexed >= $LIMIT;
    }
    return($indexed);
}

#####################################################
# IndexContig($idx, $dbh)
#
# Writes one document per row of the contig table (id and length). The index
# is committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the number of documents indexed; stops once $LIMIT documents have
# been written (the earlier `>` comparison let one extra document through).
sub IndexContig {

    my ($idx, $dbh) = @_;

    my $sth = $dbh->prepare("SELECT id, length FROM contig");
    $sth->execute();
    my $i = 0;
    while (my ($id, $len) = $sth->fetchrow_array()) {
        my $desc = qq($id is an Ensembl/EMBL sequence contig ID [length: $len bp].);
        # Built locally; note this uses "contig=" whereas the package-level
        # $CONTIGURL prefix uses "contigid=".
        my $url = qq(http://www.ensembl.org/perl/contigview?contig=$id);
        my $xml = make_XML("contig", $id, "-", "-", "-", "-", $id, $url, \$desc);
        ens_avs_indexdoc($idx, $id, $id, $xml, "contig");
        print ".";
        $i++;
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $sth->finish();
            return($i);
        }
    }
    return($i);
}

#####################################################
# IndexClone($idx, $dbh)
#
# Writes one document per row of the clone table, searchable by both the bare
# clone id and "id.version". The index is committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the number of documents indexed (0 when the table is empty); stops
# once $LIMIT documents have been written.
sub IndexClone {

    my ($idx, $dbh) = @_;

    my $sth = $dbh->prepare("SELECT id, version FROM clone");
    $sth->execute();
    my $i = 0;
    while (my ($id, $ver) = $sth->fetchrow_array()) {
        my $desc = qq($id is an Ensembl/EMBL clone ID. The clone sequence version used in Ensembl is $ver [$id.$ver].);
        # Built locally; this points at contigview, unlike the package-level
        # $CLONEURL prefix, which points at cloneview.
        my $url = qq(http://www.ensembl.org/perl/contigview?clone=$id);
        my $xml = make_XML("clone", $id, "-", "-", "-", "-", $id, $url, \$desc);
        ens_avs_indexdoc($idx, "$id.$ver", "$id $id.$ver", $xml, "clone");
        print ".";
        $i++;
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $sth->finish();
            return($i);
        }
    }
    return($i);
}

#####################################################
# IndexChromosome($idx, $dbh)
#
# Writes one document for each chromosome name in a fixed list (1-22 and X),
# then commits the index once. $dbh is accepted for signature parity with the
# other indexers but is not used.
#
# Returns the number of documents indexed.
sub IndexChromosome {

    my ($idx, $dbh) = @_;

    my @chromosomes = qw(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X);
    my $url_prefix  = qq(http://www.ensembl.org/perl/mapview?chr=);
    my $count       = 0;

    for my $chr (@chromosomes) {
        my $desc = qq(Human chromosome $chr);
        my $xml  = make_XML("Human chromosome", $chr, ("-") x 4, $chr, $url_prefix . $chr, \$desc);
        ens_avs_indexdoc($idx, $chr, $chr, $xml, "chr");
        print ".";
        $count++;
    }

    print "[Index commit OK]" if ens_avs_commit($idx);

    return($count);
}

#####################################################
# IndexExternal($idx, $dbh)
#
# Reads the pipe-delimited remap file and writes one document per recognised
# record. A record has the form
#     <gene id>|<TAG>|<external id>|<abstract text>
# where TAG is MIM/OMIM, SPTR, INTERPRO or PDB. Abstracts are cut to 500
# characters (plus " [...]"). A line is tested against every record type, as
# before, so a line matching several tags yields several documents. The index
# is committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - unused; accepted for signature parity with the other indexers
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
# Dies if the input file cannot be opened.
sub IndexExternal {

    my ($idx, $dbh) = @_;
    my $filename = "/mysql/ensembl/www/server/utils/text_100.remap";

    # Per record type: line pattern, document title, index field, progress
    # character, and whether the SPTR-specific clean-up applies.
    my @record_types = (
        { pattern => qr/(.*?)\|O?MIM\|(.*?)\|(.*)/,    title => "MIM reference",       field => "MIM",      mark => "+" },
        { pattern => qr/(.*?)\|SPTR\|(.*?)\|(.*)/,     title => "SPTR annotation",     field => "SPTR",     mark => ".", clean => 1 },
        { pattern => qr/(.*?)\|INTERPRO\|(.*?)\|(.*)/, title => "INTERPRO annotation", field => "INTERPRO", mark => "-" },
        { pattern => qr/(.*?)\|PDB\|(.*?)\|(.*)/,      title => "PDB annotation",      field => "PDB",      mark => "|" },
    );

    # Lexical handle and a named line variable: the bareword DATA handle is
    # special to Perl, and reading into $_ would clobber the caller's $_.
    open(my $fh, '<', $filename) or die "Cannot open input file: $!\n";

    my $c = 0;
    while (my $line = <$fh>) {
        foreach my $rec (@record_types) {
            next unless $line =~ $rec->{pattern};
            my ($geneid, $extid, $abstract) = ($1, $2, $3);

            if ($rec->{clean}) {
                # Drop remaining field separators and {E...} evidence tags.
                $abstract =~ s/\|//g;
                $abstract =~ s/\{E\w\d+\}//g;
            }
            if (length($abstract) > 500) {
                $abstract = substr($abstract, 0, 500) . " [...]";
            }

            $c++;
            print $rec->{mark};
            my $URL = $GENEURL . $geneid;
            # Every record type passes its own abstract (the PDB branch
            # previously handed over the last INTERPRO abstract).
            my $xml = make_XML($rec->{title}, $geneid, "-", "-", "-", "-", $extid, $URL, \$abstract);
            ens_avs_indexdoc($idx, $c, $abstract, $xml, $rec->{field});

            # Checked once per document, so the counter never needs to be
            # bumped artificially to avoid repeated commits.
            if ($c % 1000 == 0) {
                if (ens_avs_commit($idx)) {
                    print "[Index commit OK:$c]";
                }
            }
        }
        last if $c >= $LIMIT;
    }
    close($fh);
    return($c);
}

#####################################################
# IndexDomain($idx, $dbh)
#
# Writes one document per distinct InterPro accession matching /IPR\d+/. Each
# document carries the accession's description (markup tags stripped) and the
# ids from the interpro table that share the accession. The index is
# committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - database handle offering prepare()
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
sub IndexDomain  {

    my ($idx, $dbh) = @_;

    my $ac_sth = $dbh->prepare("select distinct interpro_ac from interpro_description");
    # Prepared once; the accession is bound per execution instead of being
    # interpolated into the SQL text.
    my $member_sth = $dbh->prepare("
        select
            interpro.id,interpro_description.description
        from
            interpro,interpro_description
        where
            interpro_description.interpro_ac = ?
        and
            interpro_description.interpro_ac = interpro.interpro_ac
    ");

    $ac_sth->execute();
    my $i = 0;
    while (my ($ipid) = $ac_sth->fetchrow_array()) {
        next unless ($ipid =~ /IPR\d+/);

        $member_sth->execute($ipid);
        my @idlist = ();
        # Reset per accession so that an accession with no joined rows does
        # not inherit the previous accession's description.
        my $interprodes = '';
        while (my ($id, $des) = $member_sth->fetchrow_array()) {
            push(@idlist, $id);
            $interprodes = $des;
        }
        $interprodes = '' unless defined $interprodes;
        $interprodes =~ s/<.*?>//g;    # strip markup tags

        my $desc = "InterPro domain $ipid [$interprodes] has " . scalar(@idlist) . " associated external database identifiers: " . join(" ", @idlist);
        $i++;

        my $URL = qq(http://www.ensembl.org/perl/domainview?domainentry=$ipid);
        my $xml = make_XML("Interpro domain ID", $ipid, "-", "-", "-", "-", $ipid, $URL, \$desc);
        ens_avs_indexdoc($idx, "$ipid-$i", "$ipid $interprodes " . join(" ", @idlist), $xml, "interpro");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $ac_sth->finish();
            return($i);
        }
    }
    return($i);
}

#####################################################
# IndexMarker($idx, $dbh)
#
# Writes one document per hit id of features with analysis = 2, enriched with
# the marker's synonyms looked up in the MarkerSynonym table of the map
# database. The index is committed every 1000 documents.
#
#   $idx  - open index handle, passed through to the ens_avs_* calls
#   $dbh  - core database handle offering prepare()
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
# Terminates the process (bare `exit`) if the map database cannot be reached.
sub IndexMarker {
    my ($idx, $dbh) = @_;

    my $fpc_map;
    eval {
        my $mapdb = Bio::EnsEMBL::Map::DBSQL::Obj->new(
            -dbname => $ENSEMBL_MAP,
            -user   => $ENSEMBL_DBUSER,
            -host   => $ENSEMBL_HOST,
            -port   => $ENSEMBL_HOST_PORT,
            -ensdb  => $ENSEMBL_DB,
        );
        $fpc_map = $mapdb->get_Map('FPC');
    };
    if ($@) {
        print STDERR "Apologies. A connection exception occured: $@\n";
        exit;
    }

    my $marker_sth = $dbh->prepare("select hid from feature where analysis = 2");
    # Prepared once; the marker id is bound per execution, so ids containing
    # quote characters cannot break the statement.
    my $syn_sth = $fpc_map->db->prepare("
        select
            name
        from
            MarkerSynonym
        where
            marker = ?
    ");

    $marker_sth->execute();
    my $i = 0;
    while (my ($rhid) = $marker_sth->fetchrow_array()) {
        $syn_sth->execute($rhid);
        my @synonyms = ();
        while (my ($syn) = $syn_sth->fetchrow_array()) {
            push(@synonyms, $syn);
        }

        my $desc;
        if (@synonyms) {
            $desc = "Map marker $rhid has " . scalar(@synonyms) . " synonyms: " . join(" ", @synonyms);
        } else {
            $desc = "Map marker $rhid";
        }
        $i++;

        my $URL = qq(http://www.ensembl.org/perl/markerview?marker=$rhid);
        my $xml = make_XML("Radiation hybrid marker", $rhid, "-", "-", "-", "-", $rhid, $URL, \$desc);
        ens_avs_indexdoc($idx, "$rhid-$i", "$rhid " . join(" ", @synonyms), $xml, "rhmarker");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $marker_sth->finish();
            return($i);
        }
    }
    return($i);
}

#####################################################
# IndexSNP($idx, $dbh)
#
# Opens its own connection to the SNP database and writes one document per
# RefSNP/SubSNP pair. The index is committed every 1000 documents. $dbh is
# accepted for signature parity with the other indexers but is not used.
#
# Returns the number of documents indexed (undef when the query yields no
# rows); stops once $LIMIT is reached. Terminates the process (bare `exit`)
# if the connection attempt throws.
sub IndexSNP {
	my ($idx, $dbh) = @_;

	# The eval yields the adaptor on success and undef after an exception.
	my $adaptor = eval {
		Bio::EnsEMBL::ExternalData::SNPSQL::WebSNPAdaptor->new(
			-dbname => $ENSEMBL_SNP,
			-user   => $ENSEMBL_DBUSER,
			-host   => $ENSEMBL_HOST,
			-port   => $ENSEMBL_HOST_PORT,
		);
	};
	if ($@) {
		print STDERR "Apologies. A connection exception occured: $@\n";
		exit;
	}

	my $query = $adaptor->prepare("select 
								RefSNP.id, 
								RefSNP.observed, 
								SubSNP.handle, 
								SubSNP.altid 
							from 
								RefSNP, SubSNP 
							where 
								RefSNP.id = SubSNP.refsnpid
							");
	$query->execute();

	my $count;
	while (my @row = $query->fetchrow_array()) {
		my ($snpid, $obs, $handle, $altid) = @row;
		$count++;

		my $summary = "Single nucleotide polymorphism (SNP) $snpid [Alleles: $obs]. Origin: $handle; Alternative ID: $altid";
		my $link    = qq(http://www.ensembl.org/perl/snpview?snp=$snpid);
		my $xml     = make_XML("SNP", $snpid, ("-") x 4, $snpid, $link, \$summary);
		ens_avs_indexdoc($idx, "$snpid-$count", "$snpid $altid", $xml, "snp");
		print ".";

		unless ($count % 1000) {
			print "[Index commit OK:$count]" if ens_avs_commit($idx);
		}

		# Release the statement before leaving early at the cap.
		if ($count >= $LIMIT) {
			$query->finish();
			return($count);
		}
	}
	return($count);
}  

#####################################################

# IndexFamily($idx, $dbh)
#
# Opens its own connection to the family database and writes one document per
# row of the family table, listing the family's member ids. The index is
# committed every 1000 documents. $dbh is accepted for signature parity with
# the other indexers but is not used.
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
# Terminates the process (bare `exit`) if the connection attempt throws.
sub IndexFamily {

    my ($idx, $dbh) = @_;

    my $famdb;
    eval {
        $famdb = Bio::EnsEMBL::ExternalData::Family::FamilyAdaptor->new(
            -dbname => $ENSEMBL_FAMILY,
            -host   => $ENSEMBL_HOST,
            -port   => $ENSEMBL_HOST_PORT,
            -user   => $ENSEMBL_DBUSER,
            -pass   => $ENSEMBL_DBPASS
        );
    };
    if ($@) {
        print STDERR "Apologies. A connection exception occured: $@\n";
        exit;
    }

    my $family_sth = $famdb->_db_handle->prepare("
        select
            internal_id, id, description
        from
            family
    ");
    # Prepared once; the family's internal id is bound per execution.
    my $member_sth = $famdb->_db_handle->prepare("
        select
            db_id
        from
            family_members
        where
            family = ?
    ");

    $family_sth->execute();
    my $i = 0;
    while (my ($famid, $id, $desc) = $family_sth->fetchrow_array()) {
        $member_sth->execute($famid);
        my @members = ();
        while (my ($dbid) = $member_sth->fetchrow_array()) {
            push(@members, $dbid);
        }

        if (@members) {
            $desc = "Ensembl protein family $id [$desc] has " . scalar(@members) . " members: " . join(" ", @members);
        } else {
            $desc = "Ensembl protein family $id";
        }
        $i++;

        my $URL = qq(http://www.ensembl.org/perl/familyview?family=$id);
        my $xml = make_XML("Protein Family", $id, "-", "-", "-", "-", $id, $URL, \$desc);
        # The space before the member list keeps the last word of $desc from
        # fusing with the first member id into one unsearchable token.
        ens_avs_indexdoc($idx, "$id-$i", "$id $desc " . join(" ", @members), $xml, "protfamily");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $member_sth->finish();
            $family_sth->finish();
            last;
        }
    }
    return($i);
}
 
#####################################################

# IndexDisease($idx, $dbh)
#
# Builds a DiseaseHandler on top of freshly opened core and map connections
# and writes one document per row of the disease/gene join. The index is
# committed every 1000 documents. $dbh is accepted for signature parity with
# the other indexers but is not used.
#
# Returns the number of documents indexed (undef when the query yields no
# rows); stops once $LIMIT is reached. Terminates the process (bare `exit`)
# if any of the connection attempts throws.
sub IndexDisease  {

	my ($idx, $dbh) = @_;

	# The eval yields the handler on success and undef after an exception.
	my $handler = eval {
		my $core_db = Bio::EnsEMBL::DBLoader->new(&EnsWeb::get_locator());

		my $map_db = Bio::EnsEMBL::Map::DBSQL::Obj->new(
			-user   => $ENSEMBL_DBUSER,
			-dbname => $ENSEMBL_MAP,
			-host   => $ENSEMBL_HOST,
			-port   => $ENSEMBL_HOST_PORT,
			-ensdb  => $ENSEMBL_DB
		);

		DiseaseHandler->new(
			-user   => $ENSEMBL_DBUSER,
			-dbname => $ENSEMBL_DISEASE,
			-host   => $ENSEMBL_HOST,
			-port   => $ENSEMBL_HOST_PORT,
			-ensdb  => $core_db,
			-mapdb  => $map_db
		);
	};
	if ($@) {
		print STDERR "Apologies. A connection exception occured: $@\n";
		exit;
	}

	my $query = $handler->_db_handle->prepare("select
								gene_symbol, omim_id, disease
							from
								disease, gene
							where
								disease.id = gene.id
							");
	$query->execute();

	my $count;
	while (my @row = $query->fetchrow_array()) {
		my ($symbol, $omimid, $disease) = @row;
		my $summary = "Disease ID $symbol (OMIM = $omimid): $disease";
		$count++;

		# The disease name goes into a query string, so it is URL-escaped.
		my $link = qq(http://www.ensembl.org/perl/diseaseview?disease=) . &CGI::escape($disease);
		my $xml  = make_XML("Disease", $symbol, ("-") x 4, $symbol, $link, \$summary);
		ens_avs_indexdoc($idx, "$symbol-$count", "$symbol $summary $omimid", $xml, "disease");
		print ".";

		unless ($count % 1000) {
			print "[Index commit OK:$count]" if ens_avs_commit($idx);
		}

		# Release the statement before leaving early at the cap.
		if ($count >= $LIMIT) {
			$query->finish();
			last;
		}
	}
	return($count);
}

#####################################################

# IndexIPI($idx, $dbh)
#
# Reads the FASTA file "IPI.fa" from the current working directory and writes
# one document per header line, keyed on the first whitespace-delimited word
# after ">". The index is committed every 1000 documents. $dbh is accepted
# for signature parity with the other indexers but is not used.
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
# If the file cannot be opened a warning is issued and 0 is returned.
sub IndexIPI {

    my ($idx, $dbh) = @_;
    my $i = 0;

    # Lexical handle, three-argument open. A failed open now returns at once
    # instead of falling through to read from an unopened handle.
    my $fh;
    unless (open($fh, '<', "IPI.fa")) {
        warn "Can't open IPI flat file for indexing: $!\n";
        return($i);
    }

    # A named line variable keeps the caller's $_ intact.
    while (my $line = <$fh>) {
        next unless $line =~ /^>(\S+)/;
        # Copy the capture straight away; $1 does not survive later matches.
        my $ipi_id = $1;

        my $desc = "IPI/IGI: $ipi_id";
        my $URL = qq(http://www.sanger.ac.uk/srs6bin/cgi-bin/wgetz?-e+[IPI-ID:$ipi_id]);
        my $xml = make_XML("IPI Entry", $ipi_id, "-", "-", "-", "-", $ipi_id, $URL, \$desc);
        ens_avs_indexdoc($idx, "$ipi_id-$i", $ipi_id, $xml, "ipi");
        $i++;
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        last if $i >= $LIMIT;
    }
    close($fh);
    return($i);
}

#####################################################
 
# IndexIDArchive($idx, $dbh)
#
# Opens its own connection to the archive database ('archive081') and writes
# one document per row of deleted_id, describing the retired identifier and,
# where one exists, the identifier it now maps to. The index is committed
# every 1000 documents. $dbh is accepted for signature parity with the other
# indexers but is not used.
#
# Returns the number of documents indexed; stops once $LIMIT is reached.
# Terminates the process (bare `exit`) if the connection attempt throws.
sub IndexIDArchive {

    my ($idx, $dbh) = @_;
    my $i = 0;
    my $archdb;

    eval {
        $archdb = Bio::EnsEMBL::DBArchive::Obj->new(
            -user   => $ENSEMBL_DBUSER,
            -dbname => 'archive081',
            -host   => $ENSEMBL_HOST,
            -port   => $ENSEMBL_HOST_PORT,
            -ensdb  => $ENSEMBL_DB
        );
    };
    if ($@) {
        print STDERR "Apologies. A connection exception occured: $@\n";
        exit;
    }

    my $sth = $archdb->_db_handle->prepare("
        select
            old_id, new_id
        from
            deleted_id
    ");
    $sth->execute();

    while (my ($old, $new) = $sth->fetchrow_array()) {
        # Reset per record: an id with no current mapping must not inherit
        # the URL computed for the previous record.
        my $URL = '';
        my $desc = "Ensembl ID $old";
        if ($new) {
            $desc .= qq( is an old identifier that is no longer used. This ID it is currently mapped to: $new);
            if ($new =~ /ENSG/) {
                $URL = qq(http://www.ensembl.org/perl/geneview?gene=$new);
            } elsif ($new =~ /ENST/) {
                $URL = qq(http://www.ensembl.org/perl/transview?transcript=$new);
            }
        } else {
            # Leading space keeps the sentence from fusing with the id.
            $desc .= qq( is no longer used. No current mapping exists for this ID);
        }
        $i++;

        my $xml = make_XML("Archived ID", $old, "-", "-", "-", "-", $old, $URL, \$desc);
        # NOTE(review): the document id and searchable content are built from
        # the NEW id, which is empty for unmapped records; $old may have been
        # intended here -- confirm before changing.
        ens_avs_indexdoc($idx, "$new-$i", $new, $xml, "archive");
        print ".";
        if ($i % 1000 == 0) {
            if (ens_avs_commit($idx)) {
                print "[Index commit OK:$i]";
            }
        }
        if ($i >= $LIMIT) {
            $sth->finish();
            last;
        }
    }
    return($i);
}
 
#####################################################
# make_XML($title, $name, $contig, $start, $end, $type, $id, $url, $txtref)
#
# Builds the AV_IDX_HDR XML header that is stored with each indexed document.
#
#   $title   - record kind; emitted as "Ensembl $title"
#   $name    - display name of the record
#   $contig, $start, $end, $type
#            - location/type fields; callers pass "-" when not applicable
#   $id      - accession emitted as ENSACC
#   $url     - link emitted as URL
#   $txtref  - REFERENCE to the abstract text
#
# Side effect: an abstract longer than 50000 characters is truncated IN PLACE
# through $txtref (with "...." appended), so the caller's variable changes.
#
# Returns the generated XML. Dies if the result exceeds 64000 characters.
sub make_XML {

    my ($title, $name, $contig, $start, $end, $type, $id, $url, $txtref) = @_;
    my $XML = XML::Generator->new();

    # Local date as DD-MM-YYYY. Computed in-process: this sub runs once per
    # document, so shelling out to date(1) meant one fork per document.
    my ($mday, $mon, $year) = (localtime)[3, 4, 5];
    my $DATE = sprintf("%02d-%02d-%04d", $mday, $mon + 1, $year + 1900);

    if (length($$txtref) > 50000) {
        $$txtref = substr($$txtref, 0, 50000) . "....";
    }
    my $xml = $XML->AV_IDX_HDR(
        $XML->TITLE("Ensembl $title"),
        $XML->NAME($name),
        $XML->CONTIG($contig),
        $XML->START($start),
        $XML->END($end),
        $XML->TYPE($type),
        $XML->ENSACC($id),
        $XML->URL($url),
        $XML->ABSTRACT($$txtref),
        # Abstract length in units of 1000 characters, rounded up.
        $XML->ABSLEN(int(length($$txtref) / 1000) + 1),
        $XML->LASTMOD($DATE),
    );
    if (length($xml) > 64000) {
        die "SetDocData too big!\n";
    }
    return($xml);
}

