1. CompareSchema ................ # Subtest: CompareSchema # Subtest: zea_mays, core, zea_mays_core_50_103_8 not ok 1 - Database schema matches schema defined in file # Failed test 'Database schema matches schema defined in file' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/CompareSchema.pm line 75. # Structures begin differing at: # $got->{external_db_bak} = HASH(0x60956a8) # $expected->{external_db_bak} = Does not exist Fix: Run the sql: > drop table external_db_bak 2. ControlledAnalysis ........... not ok 8 - Correct display properties for 'pfam' analysis # Failed test 'Correct display properties for 'pfam' analysis' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledAnalysis.pm line 100. # Structures begin differing at: # $got->{web_data} = '{'type' => 'domain'}' # $expected->{web_data} = '{"type": "domain"}' not ok 9 - Analysis 'cshl_noncoding_gene' in production database # Failed test 'Analysis 'cshl_noncoding_gene' in production database' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledAnalysis.pm line 97. not ok 10 - Analysis 'cshl_gene' in production database # Failed test 'Analysis 'cshl_gene' in production database' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledAnalysis.pm line 97. not ok 12 - Correct display properties for 'repeatmask_customlib' analysis # Failed test 'Correct display properties for 'repeatmask_customlib' analysis' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledAnalysis.pm line 100. 
# Structures begin differing at: # $got->{web_data} = ''NULL'' # $expected->{web_data} = undef not ok 14 - Correct display properties for 'trf' analysis # Failed test 'Correct display properties for 'trf' analysis' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledAnalysis.pm line 100. # Structures begin differing at: # $got->{description} = 'Tandem Repeats Finder locates adjacent copies of a pattern of nucleotides.' # $expected->{description} = 'Tandem Repeats Finder locates adjacent copies of a pattern of nucleotides.' Fix: add these analyses to ensembl_production - cshl_gene - cshl_noncoding_gene Then run the following SQL statements: >update analysis_description ad join analysis a using (analysis_id) set ad.web_data = '{"type": "domain"}' where a.logic_name='pfam'; >update analysis_description ad join analysis a using (analysis_id) set ad.web_data = null where a.logic_name='repeatmask_customlib'; Dispute: I think it is OK to ignore the 'trf' description discrepancy, because we didn't run the most up-to-date version of trf here at CSHL. The datacheck recommends "https://tandem.bu.edu/trf/trf.html", while we use "http://nar.oxfordjournals.org/cgi/content/full/27/2/573?maxtoshow=&HITS=10&hits=10&RESULTFORMAT=1&author1=Benson&andorexacttitle=and&andorexacttitleabs=and&andorexactfulltext=and&searchid=1&FIRSTINDEX=0&sortspec=relevance&fdate=1/1/1999&tdate=12/31/1999&resourcetype=HWCIT" 3. ControlledMetaKeys ........... # Subtest: ControlledMetaKeys # Subtest: zea_mays, core, zea_mays_core_50_103_8 # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm line 68. 
# not ok 1 - Meta key 'assembly.overlapping_regions' in production database # Failed test 'Meta key 'assembly.overlapping_regions' in production database' not ok 6 - Meta key 'assembly.num_toplevel_seqs' in production database # Failed test 'Meta key 'assembly.num_toplevel_seqs' in production database' not ok 8 - Meta key 'sample.search_text ' in production database # Failed test 'Meta key 'sample.search_text ' in production database' not ok 14 - Meta key 'species.short_name' in production database # Failed test 'Meta key 'species.short_name' in production database' not ok 47 - Mandatory meta key 'species.division' exists # Failed test 'Mandatory meta key 'species.division' exists' not ok 51 - Mandatory meta key 'species.url' exists # Failed test 'Mandatory meta key 'species.url' exists' Fix: These seem to be obsolete meta keys (to be deleted) and missing mandatory meta keys (to be inserted). Run the following SQL: > delete from meta where meta_key in ('assembly.overlapping_regions', 'assembly.num_toplevel_seqs', 'sample.search_text', 'species.short_name'); > insert into meta(meta_key, meta_value) values('species.division', 'EnsemblPlants'),('species.url', 'Zea_mays'); 4. ControlledTablesCore ......... # Subtest: ControlledTablesCore # Subtest: zea_mays, core, zea_mays_core_50_103_8 not ok 2 - Data in attrib_type (attrib_type_id: 1) is consistent # Failed test 'Data in attrib_type (attrib_type_id: 1) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{description} = 'Working genes that are screened for TE content and orthology with sorghum and rice.' # $expected->[0]{description} = 'Working genes that are screened for TE content and orthology with sorghum and rice' not ok 183 - Data in attrib_type (attrib_type_id: 371) is consistent # Failed test 'Data in attrib_type (attrib_type_id: 371) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'Side of the contig on which a vector lies (enum:RIGHT, LEFT).' # $expected->[0]{description} = 'Side of the contig on which a vector lies (enum:RIGHT, LEFT)' not ok 184 - Data in attrib_type (attrib_type_id: 372) is consistent # Failed test 'Data in attrib_type (attrib_type_id: 372) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'Scaffold that contains mutually ordered contigs.' # $expected->[0]{description} = 'Scaffold that contains mutually ordered contigs' not ok 185 - Data in attrib_type (attrib_type_id: 373) is consistent # Failed test 'Data in attrib_type (attrib_type_id: 373) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'Identifies the most recent version of an accession.' # $expected->[0]{description} = 'Identifies the most recent version of an accession' not ok 229 - Data in attrib_type (attrib_type_id: 421) is consistent # Failed test 'Data in attrib_type (attrib_type_id: 421) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{code} = 'vectorbase_adar' # $expected->[0]{code} = 'vectorbase_maker_pre' not ok 233 - Data in attrib_type (attrib_type_id: 425) is consistent # Failed test 'Data in attrib_type (attrib_type_id: 425) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'For polyploid genome, the genome component name the seq_region belongs to.' # $expected->[0]{description} = 'For polyploid genome, the genome component name the seq_region belongs to' not ok 320 - All data exists in master table # Failed test 'All data exists in master table' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 99. # got: '3' # expected: '0' # attrib_type (attrib_type_id: 5040)attrib_type (attrib_type_id: 5050)attrib_type (attrib_type_id: 5060) Fix: Due to ensembl_production difference, I think our local ensembl_production is out of date. Need to get most recent one from EG. Run the following sql: > update attrib_type set code = 'vectorbase_maker_pre' where code = 'vectorbase_adar'; > update attrib_type at join ensembl_production.attrib_type mat using(attrib_type_id, code) set at.description=mat.description; not ok 321 - Table 'biotype' is populated # Failed test 'Table 'biotype' is populated' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 48. # '0' # > # '0' Fix: I don't quite understand this one. There is no table biotype defined. not ok 323 - Data in external_db (external_db_id: 211) is consistent # Failed test 'Data in external_db (external_db_id: 211) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{db_name} = 'VB_Community_Symbol' # $expected->[0]{db_name} = 'BRC4_Community_Symbol' not ok 324 - Data in external_db (external_db_id: 212) is consistent # Failed test 'Data in external_db (external_db_id: 212) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_name} = 'VB_External_Description' # $expected->[0]{db_name} = 'BRC4_External_Description' not ok 325 - Data in external_db (external_db_id: 213) is consistent # Failed test 'Data in external_db (external_db_id: 213) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_name} = 'VB_RNA_Description' # $expected->[0]{db_name} = 'BRC4_RNA_Description' not ok 333 - Data in external_db (external_db_id: 800) is consistent # Failed test 'Data in external_db (external_db_id: 800) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Flybase Gene' # $expected->[0]{db_display_name} = 'FlyBase gene' not ok 334 - Data in external_db (external_db_id: 801) is consistent # Failed test 'Data in external_db (external_db_id: 801) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{db_display_name} = 'Flybase translation ID' # $expected->[0]{db_display_name} = 'FlyBase translation' not ok 335 - Data in external_db (external_db_id: 804) is consistent # Failed test 'Data in external_db (external_db_id: 804) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Flybase Annotation ID' # $expected->[0]{db_display_name} = 'FlyBase annotation' not ok 336 - Data in external_db (external_db_id: 805) is consistent # Failed test 'Data in external_db (external_db_id: 805) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Flybase Symbol' # $expected->[0]{db_display_name} = 'FlyBase symbol' not ok 337 - Data in external_db (external_db_id: 808) is consistent # Failed test 'Data in external_db (external_db_id: 808) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Flybase transcript ID' # $expected->[0]{db_display_name} = 'FlyBase transcript' not ok 339 - Data in external_db (external_db_id: 821) is consistent # Failed test 'Data in external_db (external_db_id: 821) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 342 - Data in external_db (external_db_id: 826) is consistent # Failed test 'Data in external_db (external_db_id: 826) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 354 - Data in external_db (external_db_id: 1300) is consistent # Failed test 'Data in external_db (external_db_id: 1300) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 355 - Data in external_db (external_db_id: 1400) is consistent # Failed test 'Data in external_db (external_db_id: 1400) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 357 - Data in external_db (external_db_id: 1520) is consistent # Failed test 'Data in external_db (external_db_id: 1520) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'MIM disease' # $expected->[0]{db_display_name} = 'MIM morbid' not ok 393 - Data in external_db (external_db_id: 2800) is consistent # Failed test 'Data in external_db (external_db_id: 2800) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 400 - Data in external_db (external_db_id: 3300) is consistent # Failed test 'Data in external_db (external_db_id: 3300) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 401 - Data in external_db (external_db_id: 3400) is consistent # Failed test 'Data in external_db (external_db_id: 3400) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 406 - Data in external_db (external_db_id: 4200) is consistent # Failed test 'Data in external_db (external_db_id: 4200) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 407 - Data in external_db (external_db_id: 4400) is consistent # Failed test 'Data in external_db (external_db_id: 4400) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{type} = 'MISC' # $expected->[0]{type} = 'PRIMARY_DB_SYNONYM' not ok 433 - Data in external_db (external_db_id: 7180) is consistent # Failed test 'Data in external_db (external_db_id: 7180) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{db_name} = 'VB_Community_Annotation' # $expected->[0]{db_name} = 'BRC4_Community_Annotation' not ok 462 - Data in external_db (external_db_id: 12310) is consistent # Failed test 'Data in external_db (external_db_id: 12310) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Clone-based (Vega)' # $expected->[0]{db_display_name} = 'Clone-based (Vega) gene' not ok 463 - Data in external_db (external_db_id: 12315) is consistent # Failed test 'Data in external_db (external_db_id: 12315) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Clone-based (Ensembl)' # $expected->[0]{db_display_name} = 'Clone-based (Ensembl) gene' not ok 464 - Data in external_db (external_db_id: 12410) is consistent # Failed test 'Data in external_db (external_db_id: 12410) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Clone-based (Vega)' # $expected->[0]{db_display_name} = 'Clone-based (Vega) transcript' not ok 465 - Data in external_db (external_db_id: 12415) is consistent # Failed test 'Data in external_db (external_db_id: 12415) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{db_display_name} = 'Clone-based (Ensembl)' # $expected->[0]{db_display_name} = 'Clone-based (Ensembl) transcript' not ok 512 - Data in external_db (external_db_id: 20312) is consistent # Failed test 'Data in external_db (external_db_id: 20312) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'SGD' # $expected->[0]{db_display_name} = 'SGD gene name' not ok 513 - Data in external_db (external_db_id: 20313) is consistent # Failed test 'Data in external_db (external_db_id: 20313) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'SGD' # $expected->[0]{db_display_name} = 'SGD transcript name' not ok 544 - Data in external_db (external_db_id: 20378) is consistent # Failed test 'Data in external_db (external_db_id: 20378) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'PomBase' # $expected->[0]{db_display_name} = 'PomBase (peptide)' not ok 574 - Data in external_db (external_db_id: 50542) is consistent # Failed test 'Data in external_db (external_db_id: 50542) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{db_display_name} = 'LRG display in Ensembl' # $expected->[0]{db_display_name} = 'LRG display in Ensembl gene' not ok 575 - Data in external_db (external_db_id: 50543) is consistent # Failed test 'Data in external_db (external_db_id: 50543) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'LRG display in Ensembl' # $expected->[0]{db_display_name} = 'LRG display in Ensembl transcript' not ok 578 - Data in external_db (external_db_id: 50609) is consistent # Failed test 'Data in external_db (external_db_id: 50609) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'HGNC transcript name' # $expected->[0]{db_display_name} = 'Transcript name' not ok 630 - Data in external_db (external_db_id: 50682) is consistent # Failed test 'Data in external_db (external_db_id: 50682) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'BGI_2005_indica_Gene' # $expected->[0]{db_display_name} = 'BGI Gene' not ok 632 - Data in external_db (external_db_id: 50684) is consistent # Failed test 'Data in external_db (external_db_id: 50684) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'Plant environmental conditions ontology terms. 
# More information in there: # http://www.gramene.org/plant_ontology/ontology_browse.html#eo' # $expected->[0]{description} = 'Plant environmental conditions ontology terms. More information: http://www.gramene.org/plant_ontology/ontology_browse.html#eo' not ok 653 - Data in external_db (external_db_id: 50717) is consistent # Failed test 'Data in external_db (external_db_id: 50717) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'GOA' # $expected->[0]{db_display_name} = 'UniProtKB-Gene Ontology Annotation' not ok 660 - Data in external_db (external_db_id: 50726) is consistent # Failed test 'Data in external_db (external_db_id: 50726) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'ENA' # $expected->[0]{db_display_name} = 'European Nucleotide Archive feature (gene source)' not ok 661 - Data in external_db (external_db_id: 50727) is consistent # Failed test 'Data in external_db (external_db_id: 50727) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'ENA' # $expected->[0]{db_display_name} = 'European Nucleotide Archive feature (transcript source)' not ok 662 - Data in external_db (external_db_id: 50728) is consistent # Failed test 'Data in external_db (external_db_id: 50728) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{db_display_name} = 'ENA' # $expected->[0]{db_display_name} = 'European Nucleotide Archive feature (translation source)' not ok 667 - Data in external_db (external_db_id: 50734) is consistent # Failed test 'Data in external_db (external_db_id: 50734) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'TAIR identifier to link to Ensembl Translation entities. # The main requirement behind this entry, is to be able to link TAIR GO annotations to Ensembl Translation objects.' # $expected->[0]{description} = 'TAIR identifier to link to Ensembl Translation entities. The main requirement behind this entry, is to be able to link TAIR GO annotations to Ensembl Translation objects.' not ok 716 - Data in external_db (external_db_id: 50801) is consistent # Failed test 'Data in external_db (external_db_id: 50801) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'KEGG' # $expected->[0]{db_display_name} = 'KEGG Pathway and Enzyme' not ok 723 - Data in external_db (external_db_id: 50817) is consistent # Failed test 'Data in external_db (external_db_id: 50817) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_name} = 'RNACentral' # $expected->[0]{db_name} = 'RNAcentral' not ok 727 - Data in external_db (external_db_id: 50821) is consistent # Failed test 'Data in external_db (external_db_id: 50821) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. 
# Structures begin differing at: # $got->[0]{description} = 'Bread wheat 3B pseudomolecule gene models from INRA GDEC group. # ' # $expected->[0]{description} = 'Bread wheat 3B pseudomolecule gene models from INRA GDEC group.' not ok 728 - Data in external_db (external_db_id: 50822) is consistent # Failed test 'Data in external_db (external_db_id: 50822) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = 'Gene identifiers from Araport, the Arabidopsis information portal (formerly TAIR). # ' # $expected->[0]{description} = 'Gene identifiers from Araport, the Arabidopsis information portal (formerly TAIR).' not ok 730 - Data in external_db (external_db_id: 50824) is consistent # Failed test 'Data in external_db (external_db_id: 50824) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Reactome' # $expected->[0]{db_display_name} = 'Reactome gene' not ok 731 - Data in external_db (external_db_id: 50825) is consistent # Failed test 'Data in external_db (external_db_id: 50825) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{db_display_name} = 'Reactome' # $expected->[0]{db_display_name} = 'Reactome transcript' not ok 735 - All data exists in master table # Failed test 'All data exists in master table' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 99. # got: '217' # expected: '0' Fix: All the above failures due to obsolete ensembl_production database I used. There are too many to fix by updates. 
I suggest reloading the whole external_db table from your ensembl_production database, with something like: > truncate table external_db; > insert into external_db select * from ensembl_production.external_db; not ok 744 - Data in misc_set (misc_set_id: 13) is consistent # Failed test 'Data in misc_set (misc_set_id: 13) is consistent' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledTablesCore.pm line 93. # Structures begin differing at: # $got->[0]{description} = '' # $expected->[0]{description} = 'NULL' Fix: > update misc_set set description = null where misc_set_id=13; 4. ForeignKeys .................. # Subtest: ForeignKeys not ok 23 - All exon_transcript.exon_id rows linked to exon.exon_id rows # Failed test 'All exon_transcript.exon_id rows linked to exon.exon_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeys.pm line 55. # got: '1' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # exon_transcript t1 LEFT JOIN exon t2 ON t1.exon_id = t2.exon_id # WHERE t1.exon_id IS NOT NULL AND t2.exon_id IS NULL # not ok 24 - All exon_transcript.transcript_id rows linked to transcript.transcript_id rows # Failed test 'All exon_transcript.transcript_id rows linked to transcript.transcript_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeys.pm line 55. 
# got: '1' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # exon_transcript t1 LEFT JOIN transcript t2 ON t1.transcript_id = t2.transcript_id # WHERE t1.transcript_id IS NOT NULL AND t2.transcript_id IS NULL # Fix: Both failures can be fixed with the following SQL: > delete from exon_transcript where exon_id=1975715; not ok 28 - All gene.canonical_transcript_id rows linked to transcript.transcript_id rows # Failed test 'All gene.canonical_transcript_id rows linked to transcript.transcript_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeys.pm line 55. # got: '4547' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # gene t1 LEFT JOIN transcript t2 ON t1.canonical_transcript_id = t2.transcript_id # WHERE t1.canonical_transcript_id IS NOT NULL AND t2.transcript_id IS NULL # Fix: > update gene t1 LEFT JOIN transcript t2 ON t1.canonical_transcript_id = t2.transcript_id set t1.canonical_transcript_id = null WHERE t1.canonical_transcript_id IS NOT NULL AND t2.transcript_id IS NULL; not ok 43 - All transcript_attrib.transcript_id rows linked to transcript.transcript_id rows # Failed test 'All transcript_attrib.transcript_id rows linked to transcript.transcript_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeys.pm line 55. 
# got: '63' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # transcript_attrib t1 LEFT JOIN transcript t2 ON t1.transcript_id = t2.transcript_id # WHERE t1.transcript_id IS NOT NULL AND t2.transcript_id IS NULL # Fix: > delete t1.* FROM transcript_attrib t1 LEFT JOIN transcript t2 ON t1.transcript_id = t2.transcript_id WHERE t1.transcript_id IS NOT NULL AND t2.transcript_id IS NULL; not ok 109 - All exon.exon_id rows linked to exon_transcript.exon_id rows # Failed test 'All exon.exon_id rows linked to exon_transcript.exon_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeys.pm line 103. # got: '2' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # exon t1 LEFT JOIN exon_transcript t2 ON t1.exon_id = t2.exon_id # WHERE t1.exon_id IS NOT NULL AND t2.exon_id IS NULL # Fix: > delete t1.* FROM exon t1 LEFT JOIN exon_transcript t2 ON t1.exon_id = t2.exon_id WHERE t1.exon_id IS NOT NULL AND t2.exon_id IS NULL; 5. GeneBiotypes ................. # Subtest: GeneBiotypes not ok 1 - Genes have valid biotypes # Failed test 'Genes have valid biotypes' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/GeneBiotypes.pm line 75. 
# got: '44303' # expected: '0' # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb093920) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb033650) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb392700) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb166650) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb305300) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb181300) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb298920) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb229510) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb031900) # Invalid biotype for core gene: (t1.stable_id) = (Zm00001eb023090) # Reached limit for number of diagnostic messages # Execute # SELECT t1.stable_id FROM # gene t1 INNER JOIN # seq_region USING (seq_region_id) INNER JOIN # coord_system USING (coord_system_id) LEFT OUTER JOIN # biotype t2 ON ( # t1.biotype = t2.name COLLATE latin1_bin AND # t2.object_type = 'gene' AND # FIND_IN_SET('core', db_type) # ) # WHERE # t1.biotype IS NOT NULL AND # t2.name IS NULL AND # coord_system.species_id = 1 # against zea_mays_core_50_103_8 to see all results not ok 2 - Transcripts have valid biotypes # Failed test 'Transcripts have valid biotypes' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/GeneBiotypes.pm line 75. 
# got: '77341' # expected: '0' # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb428660_T001) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb220250_T001) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb220250_T002) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb104060_T001) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb104060_T002) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb105040_T004) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb105040_T001) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb105040_T006) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb105040_T003) # Invalid biotype for core transcript: (t1.stable_id) = (Zm00001eb105040_T002) # Reached limit for number of diagnostic messages # Execute # SELECT t1.stable_id FROM # transcript t1 INNER JOIN # seq_region USING (seq_region_id) INNER JOIN # coord_system USING (coord_system_id) LEFT OUTER JOIN # biotype t2 ON ( # t1.biotype = t2.name COLLATE latin1_bin AND # t2.object_type = 'transcript' AND # FIND_IN_SET('core', db_type) # ) # WHERE # t1.biotype IS NOT NULL AND # t2.name IS NULL AND # coord_system.species_id = 1 # against zea_mays_core_50_103_8 to see all results Fix: Not fixable: There is no biotype table in my database. SELECT t1.stable_id FROM gene t1 INNER JOIN seq_region USING (seq_region_id) INNER JOIN coord_system USING (coord_system_id) LEFT OUTER JOIN biotype t2 ON (t1.biotype = t2.name COLLATE latin1_bin AND t2.object_type = 'gene' AND FIND_IN_SET('core', db_type)) WHERE t1.biotype IS NOT NULL AND t2.name IS NULL AND coord_system.species_id = 1 limit 5; ERROR 1146 (42S02): Table 'zea_maysb73_core_3_87_1.biotype' doesn't exist 6. MetaCoord .................... 
# Subtest: MetaCoord not ok 2 - Contents of meta_coord table are correct # Failed test 'Contents of meta_coord table are correct' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaCoord.pm line 49. # Structures begin differing at: # $got->{gene}{2} = '288333' # $expected->{gene}{2} = '34834' # { # 'exon' => { # '1' => 10979, # '2' => 9171 # }, # 'gene' => { # '1' => 751401, # '2' => 288333 # }, # 'repeat_feature' => { # '1' => 300971, # '2' => 161743 # }, # 'transcript' => { # '1' => 17036, # '2' => 16822 # } # } # { # 'exon' => { # '1' => 10979, # '2' => 5086 # }, # 'gene' => { # '1' => 751401, # '2' => 34834 # }, # 'repeat_feature' => { # '1' => 300971, # '2' => 161743 # }, # 'transcript' => { # '1' => 745092, # '2' => 34834 # } # } Fix: > update meta_coord set max_length=34834 where table_name='gene' and coord_system_id=2; > update meta_coord set max_length=745092 where table_name='transcript' and coord_system_id=1; > update meta_coord set max_length=34834 where table_name='transcript' and coord_system_id=2; > update meta_coord set max_length=5086 where table_name='exon' and coord_system_id=2; 7. MetaKeyFormat ................ # Subtest: MetaKeyFormat # Subtest: zea_mays, core, zea_mays_core_50_103_8 not ok 2 - Value for assembly.date has correct format # Failed test 'Value for assembly.date has correct format' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 70. # '2019' # doesn't match '(?^:^\d{4}-\d{2}$)' => 2019-06 not ok 4 - Value for genebuild.id has correct format # Failed test 'Value for genebuild.id has correct format' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 70. # 'cshl2019' # doesn't match '(?^:^\d+$)' => ???
not ok 5 - Value for genebuild.initial_release_date has correct format # Failed test 'Value for genebuild.initial_release_date has correct format' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 70. # '2019' # doesn't match '(?^:^\d{4}-\d{2}$)' => 2020-09 not ok 6 - Value for genebuild.last_geneset_update has correct format # Failed test 'Value for genebuild.last_geneset_update has correct format' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 70. # '2019' # doesn't match '(?^:^\d{4}-\d{2}$)' => 2020-09 not ok 7 - Value for genebuild.method has correct format # Failed test 'Value for genebuild.method has correct format' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 70. # 'mikado' ==> import # doesn't match '(?^:^(full_genebuild|projection_build|import|mixed_strategy_build|external_annotation_import|maker_genebuild|curated)$)' not ok 8 - Value for genebuild.start_date has correct format # Failed test 'Value for genebuild.start_date has correct format' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 70. # '2019-cshl' # doesn't match '(?^:^\d{4}\-\d{2}\-\S+$)' => '2019-09-cshl' not ok 54 - All meta.meta_value rows linked to gene.stable_id rows # Failed test 'All meta.meta_value rows linked to gene.stable_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 78. 
# got: '1' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # meta t1 LEFT JOIN gene t2 ON t1.meta_value = t2.stable_id # WHERE t1.meta_value IS NOT NULL AND t2.stable_id IS NULL # AND meta_key = "sample.gene_param" not ok 55 - All meta.meta_value rows linked to transcript.stable_id rows # Failed test 'All meta.meta_value rows linked to transcript.stable_id rows' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm line 79. # got: '1' # expected: '0' # Broken referential integrity found with SQL: # SELECT COUNT(*) FROM # meta t1 LEFT JOIN transcript t2 ON t1.meta_value = t2.stable_id # WHERE t1.meta_value IS NOT NULL AND t2.stable_id IS NULL # AND meta_key = "sample.transcript_param" Fix: > update meta set meta_value=replace(meta_value, 'Zm00001e000001', 'Zm00001eb404730'); > update meta set meta_value='2019-06' where meta_key='assembly.date'; > update meta set meta_value='2020-09' where meta_key='genebuild.initial_release_date'; > update meta set meta_value='2020-09' where meta_key='genebuild.last_geneset_update'; > update meta set meta_value='import' where meta_key='genebuild.method'; > update meta set meta_value='2019-09-cshl' where meta_key='genebuild.start_date'; # genebuild.id ??? maybe should delete > delete from meta where meta_key='genebuild.id'; 8. MySQLStorageEngine ........... # Subtest: MySQLStorageEngine # Subtest: zea_mays, core, zea_mays_core_50_103_8 not ok 1 - All tables are using MySQL MyISAM storage engine # Failed test 'All tables are using MySQL MyISAM storage engine' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/MySQLStorageEngine.pm line 49. 
# got: '1' # expected: '0' # Non-MyISAM table: (TABLE_NAME) = (meta_coord_chr) # Execute SELECT TABLE_NAME FROM # information_schema.tables WHERE # table_schema = 'zea_mays_core_50_103_8' AND # engine <> 'MyISAM' # against zea_mays_core_50_103_8 to replicate these results 1..1 # Looks like you failed 1 test of 1. not ok 1 - zea_mays, core, zea_mays_core_50_103_8 # Failed test 'zea_mays, core, zea_mays_core_50_103_8' # at /nfs/panda/ensemblgenomes/development/bcontreras//ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm line 560. Fix: > drop table meta_coord_chr; 9. ProteinTranslation ........... # Subtest: ProteinTranslation # Subtest: zea_mays, core, zea_mays_core_50_103_8 ok 1 - Protein-coding genes have translations ok 2 - Amino acid sequences have non-zero length not ok 3 - Protein-coding genes have no internal stop codons # Failed test 'Protein-coding genes have no internal stop codons' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/ProteinTranslation.pm line 80. # got: '1' # expected: '0' # Zm00001eb379010_T001 has invalid translation: MERNRKSVVVRTPFCRPSAADGTGPPLFHPAAQAGALLQIPSTGSPQGLILGRSLHQAAASQLPSPGGVQSSGAGIPSTESGQRAIAQYPLAGFHGHVGCSETRSPPTLTQLLQSSDSFMNYMIGKEGQNLEENAHFIETTSRVSDVDIETDDGIEENRKGSRLIWKHDEDVRMMSAWLKHSLDPVRLDCTGNWSSYFTTRLD* Fix: Can we ignore this if the internal stop codon is at the end of the sequence? Or we could reduce the end_exon's seq_end by 3? 
(> update translation set seq_end=2 where translation_id=588259;) mysql> select e.exon_id, e.seq_region_id, e.seq_region_start, e.seq_region_end, e.seq_region_strand, e.stable_id from exon e join exon_transcript et using(exon_id) where et.transcript_id=593031; +---------+---------------+------------------+----------------+-------------------+----------------------------+ | exon_id | seq_region_id | seq_region_start | seq_region_end | seq_region_strand | stable_id | +---------+---------------+------------------+----------------+-------------------+----------------------------+ | 2265561 | 9 | 28500633 | 28500976 | 1 | Zm00001eb379010_T001.exon1 | | 2265562 | 9 | 28501083 | 28501260 | 1 | Zm00001eb379010_T001.exon2 | | 2265563 | 9 | 28501343 | 28501430 | 1 | Zm00001eb379010_T001.exon3 | | 2265564 | 9 | 28508846 | 28508850 | 1 | Zm00001eb379010_T001.exon4 | +---------+---------------+------------------+----------------+-------------------+----------------------------+ 4 rows in set (0.00 sec) mysql> select translation_id, transcript_id, start_exon_id, seq_start, end_exon_id, seq_end, stable_id from translation where transcript_id=593031; +----------------+---------------+---------------+-----------+-------------+---------+----------------------+ | translation_id | transcript_id | start_exon_id | seq_start | end_exon_id | seq_end | stable_id | +----------------+---------------+---------------+-----------+-------------+---------+----------------------+ | 588259 | 593031 | 2265561 | 1 | 2265564 | 5 | Zm00001eb379010_P001 | +----------------+---------------+---------------+-----------+-------------+---------+----------------------+ 1 row in set (0.00 sec) 10. RepeatFeatures ............... # Subtest: RepeatFeatures not ok 5 - Repeat start > 0 # Failed test 'Repeat start > 0' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/RepeatFeatures.pm line 100.
# got: '3' # expected: '0' 1..5 # Looks like you failed 1 test of 5. not ok 1 - zea_mays, core, zea_mays_core_50_103_8 # Failed test 'zea_mays, core, zea_mays_core_50_103_8' # at /nfs/panda/ensemblgenomes/development/bcontreras//ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm line 560. Fix: I suggest deleting them: > delete from repeat_feature where repeat_start <= 0; mysql> select * from repeat_feature where repeat_start <= 0; +-------------------+---------------+------------------+----------------+-------------------+--------------+------------+---------------------+-------------+-------+ | repeat_feature_id | seq_region_id | seq_region_start | seq_region_end | seq_region_strand | repeat_start | repeat_end | repeat_consensus_id | analysis_id | score | +-------------------+---------------+------------------+----------------+-------------------+--------------+------------+---------------------+-------------+-------+ | 1627421 | 10 | 50242381 | 50242579 | 1 | -5 | 1550 | 118 | 5 | 5129 | | 4212931 | 9 | 41532865 | 41533553 | 1 | 0 | 1550 | 129 | 5 | 4451 | | 4413729 | 1 | 2355321 | 2355836 | 1 | 0 | 1550 | 129 | 5 | 4270 | +-------------------+---------------+------------------+----------------+-------------------+--------------+------------+---------------------+-------------+-------+ 3 rows in set (0.93 sec) 11. SpeciesTaxonomy .............. # Subtest: SpeciesTaxonomy not ok 4 - Species name correct for taxonomy ID (4577) # Failed test 'Species name correct for taxonomy ID (4577)' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm line 107. # got: 'Zea maize' # expected: 'Zea mays' 1..4 not ok 1 - zea_mays, core, zea_mays_core_50_103_8 # Failed test 'zea_mays, core, zea_mays_core_50_103_8' # at /nfs/panda/ensemblgenomes/development/bcontreras//ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm line 560. Fix: Seems like a typo: 'maize' should be 'mays'. Maybe it is in the meta table?
12. VersionedGenes ............... # Subtest: VersionedGenes # Subtest: zea_mays, core, zea_mays_core_50_103_8 not ok 1 - Genes are unversioned # Failed test 'Genes are unversioned' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/VersionedGenes.pm line 84. # got: '44303' # expected: '0' # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb093920, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb033650, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb392700, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb166650, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb305300, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb181300, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb298920, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb229510, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb031900, 1) # Versioned gene: (gene.stable_id, gene.version) = (Zm00001eb023090, 1) # Reached limit for number of diagnostic messages # Execute # SELECT gene.stable_id, gene.version FROM # gene INNER JOIN # seq_region sr USING (seq_region_id) INNER JOIN # coord_system cs USING (coord_system_id) # WHERE cs.species_id = 1 # AND gene.version IS NOT NULL # against zea_mays_core_50_103_8 to see all results not ok 2 - Transcripts are unversioned # Failed test 'Transcripts are unversioned' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/VersionedGenes.pm line 84. 
# got: '77341' # expected: '0' # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb428660_T001, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb220250_T001, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb220250_T002, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb104060_T001, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb104060_T002, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb105040_T004, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb105040_T001, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb105040_T006, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb105040_T003, 1) # Versioned transcript: (transcript.stable_id, transcript.version) = (Zm00001eb105040_T002, 1) # Reached limit for number of diagnostic messages # Execute # SELECT transcript.stable_id, transcript.version FROM # transcript INNER JOIN # seq_region sr USING (seq_region_id) INNER JOIN # coord_system cs USING (coord_system_id) # WHERE cs.species_id = 1 # AND transcript.version IS NOT NULL # against zea_mays_core_50_103_8 to see all results not ok 3 - Exons are unversioned # Failed test 'Exons are unversioned' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/VersionedGenes.pm line 84. 
# got: '273409' # expected: '0' # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon5, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon6, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon7, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon8, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon9, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon10, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon11, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon12, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon13, 1) # Versioned exon: (exon.stable_id, exon.version) = (Zm00001eb284360_T001.exon14, 1) # Reached limit for number of diagnostic messages # Execute # SELECT exon.stable_id, exon.version FROM # exon INNER JOIN # seq_region sr USING (seq_region_id) INNER JOIN # coord_system cs USING (coord_system_id) # WHERE cs.species_id = 1 # AND exon.version IS NOT NULL # against zea_mays_core_50_103_8 to see all results not ok 4 - Translations are unversioned # Failed test 'Translations are unversioned' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/VersionedGenes.pm line 110. 
# got: '72569' # expected: '0' # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb289080_P002, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb294830_P001, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb231290_P002, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb231290_P004, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb231290_P001, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb231290_P003, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb231290_P005, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb417370_P003, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb417370_P002, 1) # Versioned translation: (tn.stable_id, tn.version) = (Zm00001eb417370_P001, 1) # Reached limit for number of diagnostic messages # Execute # SELECT tn.stable_id, tn.version FROM # translation tn INNER JOIN # transcript tt USING (transcript_id) INNER JOIN # seq_region sr USING (seq_region_id) INNER JOIN # coord_system cs USING (coord_system_id) # WHERE cs.species_id = 1 # AND tn.version IS NOT NULL # against zea_mays_core_50_103_8 to see all results 1..4 # Looks like you failed 4 tests of 4. Fix: I don't quite understand the problem, but I guess we should not assign a default version to gene/transcript/exon/translation? If so, run one update per table: > update gene set version = null; > update transcript set version = null; > update exon set version = null; > update translation set version = null; 13. WhitespaceCritical ........... # Subtest: WhitespaceCritical not ok 43 - Column external_db.description contains no carriage returns # Failed test 'Column external_db.description contains no carriage returns' # at /nfs/panda/ensemblgenomes/development/bcontreras/ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/WhitespaceCritical.pm line 67. # got: '3' # expected: '0' # Whitespace characters (50684, EO, 1, XREF, 0, Environment Ontology, MISC, NULL, NULL, Plant environmental conditions ontology terms.
# More information in there: # http://www.gramene.org/plant_ontology/ontology_browse.html#eo) # Whitespace characters (50734, TAIR_TRANSLATION, 1, XREF, 1, TAIR Translation identifier, MISC, NULL, NULL, TAIR identifier to link to Ensembl Translation entities. # The main requirement behind this entry, is to be able to link TAIR GO annotations to Ensembl Translation objects.) # Whitespace characters (50781, TriTryp_pathway_comparison, 1, KNOWNXREF, 50, TriTryp pathway comparison database, MISC, NULL, NULL, TriTryp pathway comparison database (http://tritrypdb.org/tritrypdb/) # ) # Execute # SELECT * FROM external_db # WHERE description REGEXP ' ' # against zea_mays_core_50_103_8 to replicate these results Fix: again, this is related to external_db; it should be fixed if we replace it with ensembl_production.external_db.