From c8d886d875609254fec32ab45b6b0cdd6f73d961 Mon Sep 17 00:00:00 2001
From: Trish Whetzel
Date: Mon, 11 Nov 2024 13:03:21 -0800
Subject: [PATCH 1/5] init qc checks

---
 ...iple-equivalentTo-gene-associations.sparql | 35 +++++++++++++++++++
 ...ltiple-subclassOf-gene-associations.sparql | 32 +++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
 create mode 100644 src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql

diff --git a/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
new file mode 100644
index 0000000000..5edb1614fd
--- /dev/null
+++ b/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
@@ -0,0 +1,35 @@
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+
+SELECT DISTINCT ?class ?mondoCurie ?classLabel ?roProperty (GROUP_CONCAT(DISTINCT ?hgncIdentifier; separator=", ") AS ?hgncIdentifiers) (COALESCE(?source, "No Source") AS ?sourceAnnotation)
+WHERE {
+  ?class owl:equivalentClass ?equivClass ;
+         rdfs:label ?classLabel .
+
+  ?equivClass owl:intersectionOf/rdf:rest*/rdf:first ?component .
+
+  ?component rdf:type owl:Restriction ;
+             owl:onProperty obo:RO_0004003 ;
+             owl:someValuesFrom ?hgncIdentifier .
+
+  # Filter for gene identifiers with HGNC prefix
+  FILTER(STRSTARTS(STR(?hgncIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?hgncIdentifier), "http://identifiers.org/ncbigene/"))
+
+  OPTIONAL {
+    ?annotation a owl:Axiom ;
+                owl:annotatedSource ?class ;
+                owl:annotatedProperty obo:RO_0004003 ;
+                owl:annotatedTarget ?hgncIdentifier ;
+                oboInOwl:source ?source .
+  }
+
+  BIND(REPLACE(STR(?class), "http://purl.obolibrary.org/obo/MONDO_", "MONDO:") AS ?mondoCurie)
+  BIND(STR(obo:RO_0004003) AS ?roProperty)
+}
+GROUP BY ?class ?mondoCurie ?classLabel ?roProperty ?source
+HAVING (COUNT(DISTINCT ?hgncIdentifier) > 1)
+ORDER BY ?mondoCurie
diff --git a/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
new file mode 100644
index 0000000000..f5fc9e0fed
--- /dev/null
+++ b/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
@@ -0,0 +1,32 @@
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+
+SELECT DISTINCT ?class ?mondoCurie ?classLabel ?roProperty (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers) (COALESCE(?source, "No Source") AS ?sourceAnnotation)
+WHERE {
+  ?class rdfs:subClassOf ?restriction ;
+         rdfs:label ?classLabel .
+
+  ?restriction rdf:type owl:Restriction ;
+               owl:onProperty obo:RO_0004003 ;
+               owl:someValuesFrom ?geneIdentifier .
+
+  # Filter for gene identifiers with HGNC or NCBIGene prefixes
+  FILTER(STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/ncbigene/"))
+
+  OPTIONAL {
+    ?annotation a owl:Axiom ;
+                owl:annotatedSource ?class ;
+                owl:annotatedProperty obo:RO_0004003 ;
+                owl:annotatedTarget ?geneIdentifier ;
+                oboInOwl:source ?source .
+  }
+
+  BIND(REPLACE(STR(?class), "http://purl.obolibrary.org/obo/MONDO_", "MONDO:") AS ?mondoCurie)
+  BIND(STR(obo:RO_0004003) AS ?roProperty)
+}
+GROUP BY ?class ?mondoCurie ?classLabel ?roProperty ?source
+HAVING (COUNT(DISTINCT ?geneIdentifier) > 1)
+ORDER BY ?mondoCurie

From 7a7d9ace964e4448b79b76ab5872cdcd0a7fdbc5 Mon Sep 17 00:00:00 2001
From: Trish Whetzel
Date: Mon, 11 Nov 2024 17:17:22 -0800
Subject: [PATCH 2/5] update queries

---
 ...iple-equivalentTo-gene-associations.sparql | 30 +++++++------------
 ...ltiple-subclassOf-gene-associations.sparql | 24 +++++----------
 2 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
index 5edb1614fd..b18e2b0e22 100644
--- a/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
+++ b/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
@@ -5,31 +5,21 @@ PREFIX obo: <http://purl.obolibrary.org/obo/>
 PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
 PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
 
-SELECT DISTINCT ?class ?mondoCurie ?classLabel ?roProperty (GROUP_CONCAT(DISTINCT ?hgncIdentifier; separator=", ") AS ?hgncIdentifiers) (COALESCE(?source, "No Source") AS ?sourceAnnotation)
+# Get classes that have more than 1 equivalentTo gene association
+
+SELECT DISTINCT ?entity ?label (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers)
 WHERE {
-  ?class owl:equivalentClass ?equivClass ;
-         rdfs:label ?classLabel .
+  ?entity owl:equivalentClass ?equivClass ;
+          rdfs:label ?label .
 
   ?equivClass owl:intersectionOf/rdf:rest*/rdf:first ?component .
 
   ?component rdf:type owl:Restriction ;
              owl:onProperty obo:RO_0004003 ;
-             owl:someValuesFrom ?hgncIdentifier .
-
-  # Filter for gene identifiers with HGNC prefix
-  FILTER(STRSTARTS(STR(?hgncIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?hgncIdentifier), "http://identifiers.org/ncbigene/"))
-
-  OPTIONAL {
-    ?annotation a owl:Axiom ;
-                owl:annotatedSource ?class ;
-                owl:annotatedProperty obo:RO_0004003 ;
-                owl:annotatedTarget ?hgncIdentifier ;
-                oboInOwl:source ?source .
-  }
+             owl:someValuesFrom ?geneIdentifier .
 
-  BIND(REPLACE(STR(?class), "http://purl.obolibrary.org/obo/MONDO_", "MONDO:") AS ?mondoCurie)
-  BIND(STR(obo:RO_0004003) AS ?roProperty)
+  # Filter for gene identifiers with HGNC prefix or NCBI prefix (for non-human genes)
+  FILTER(STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/ncbigene/"))
 }
-GROUP BY ?class ?mondoCurie ?classLabel ?roProperty ?source
-HAVING (COUNT(DISTINCT ?hgncIdentifier) > 1)
-ORDER BY ?mondoCurie
+GROUP BY ?entity ?label
+HAVING (COUNT(DISTINCT ?geneIdentifier) > 1)
diff --git a/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
index f5fc9e0fed..fa9d9cd2c7 100644
--- a/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
+++ b/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
@@ -4,29 +4,19 @@ PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 PREFIX obo: <http://purl.obolibrary.org/obo/>
 PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
 
-SELECT DISTINCT ?class ?mondoCurie ?classLabel ?roProperty (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers) (COALESCE(?source, "No Source") AS ?sourceAnnotation)
+# Get classes that have more than 1 subClassOf gene association
+
+SELECT DISTINCT ?entity ?label (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers)
 WHERE {
-  ?class rdfs:subClassOf ?restriction ;
-         rdfs:label ?classLabel .
+  ?entity rdfs:subClassOf ?restriction ;
+          rdfs:label ?label .
 
   ?restriction rdf:type owl:Restriction ;
                owl:onProperty obo:RO_0004003 ;
                owl:someValuesFrom ?geneIdentifier .
 
-  # Filter for gene identifiers with HGNC or NCBIGene prefixes
+  # Filter for gene identifiers with HGNC or NCBIGene prefixes or NCBI prefix (for non-human genes)
   FILTER(STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/ncbigene/"))
-
-  OPTIONAL {
-    ?annotation a owl:Axiom ;
-                owl:annotatedSource ?class ;
-                owl:annotatedProperty obo:RO_0004003 ;
-                owl:annotatedTarget ?geneIdentifier ;
-                oboInOwl:source ?source .
-  }
-
-  BIND(REPLACE(STR(?class), "http://purl.obolibrary.org/obo/MONDO_", "MONDO:") AS ?mondoCurie)
-  BIND(STR(obo:RO_0004003) AS ?roProperty)
 }
-GROUP BY ?class ?mondoCurie ?classLabel ?roProperty ?source
+GROUP BY ?entity ?label
 HAVING (COUNT(DISTINCT ?geneIdentifier) > 1)
-ORDER BY ?mondoCurie

From 493c0809f2331db02be30f865a996f279ec8d17a Mon Sep 17 00:00:00 2001
From: Trish Whetzel
Date: Mon, 11 Nov 2024 17:18:26 -0800
Subject: [PATCH 3/5] remove individual queries

---
 ...iple-equivalentTo-gene-associations.sparql | 25 -------------------
 ...ltiple-subclassOf-gene-associations.sparql | 22 ----------------
 2 files changed, 47 deletions(-)
 delete mode 100644 src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
 delete mode 100644 src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql

diff --git a/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
deleted file mode 100644
index b18e2b0e22..0000000000
--- a/src/sparql/qc/mondo/qc-multiple-equivalentTo-gene-associations.sparql
+++ /dev/null
@@ -1,25 +0,0 @@
-PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-PREFIX owl: <http://www.w3.org/2002/07/owl#>
-PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-PREFIX obo: <http://purl.obolibrary.org/obo/>
-PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
-PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
-
-# Get classes that have more than 1 equivalentTo gene association
-
-SELECT DISTINCT ?entity ?label (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers)
-WHERE {
-  ?entity owl:equivalentClass ?equivClass ;
-          rdfs:label ?label .
-
-  ?equivClass owl:intersectionOf/rdf:rest*/rdf:first ?component .
-
-  ?component rdf:type owl:Restriction ;
-             owl:onProperty obo:RO_0004003 ;
-             owl:someValuesFrom ?geneIdentifier .
-
-  # Filter for gene identifiers with HGNC prefix or NCBI prefix (for non-human genes)
-  FILTER(STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/ncbigene/"))
-}
-GROUP BY ?entity ?label
-HAVING (COUNT(DISTINCT ?geneIdentifier) > 1)
diff --git a/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
deleted file mode 100644
index fa9d9cd2c7..0000000000
--- a/src/sparql/qc/mondo/qc-multiple-subclassOf-gene-associations.sparql
+++ /dev/null
@@ -1,22 +0,0 @@
-PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-PREFIX owl: <http://www.w3.org/2002/07/owl#>
-PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-PREFIX obo: <http://purl.obolibrary.org/obo/>
-PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
-
-# Get classes that have more than 1 subClassOf gene association
-
-SELECT DISTINCT ?entity ?label (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers)
-WHERE {
-  ?entity rdfs:subClassOf ?restriction ;
-          rdfs:label ?label .
-
-  ?restriction rdf:type owl:Restriction ;
-               owl:onProperty obo:RO_0004003 ;
-               owl:someValuesFrom ?geneIdentifier .
-
-  # Filter for gene identifiers with HGNC or NCBIGene prefixes or NCBI prefix (for non-human genes)
-  FILTER(STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/hgnc/") || STRSTARTS(STR(?geneIdentifier), "http://identifiers.org/ncbigene/"))
-}
-GROUP BY ?entity ?label
-HAVING (COUNT(DISTINCT ?geneIdentifier) > 1)

From d0cd02ee6dd2bf525dddd4d66646abaa81c66ce2 Mon Sep 17 00:00:00 2001
From: Trish Whetzel
Date: Mon, 11 Nov 2024 17:19:27 -0800
Subject: [PATCH 4/5] add combined query for mult gene associations

---
NOTE(review): unlike the two queries deleted in PATCH 3/5 (and the mismatch
query added in PATCH 5/5), this combined query has no FILTER restricting
?geneIdentifier to the identifiers.org hgnc/ncbigene IRIs, so every
RO_0004003 filler is counted. Confirm that dropping the filter is intended.

 .../qc-multiple-gene-associations.sparql | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 src/sparql/qc/mondo/qc-multiple-gene-associations.sparql

diff --git a/src/sparql/qc/mondo/qc-multiple-gene-associations.sparql b/src/sparql/qc/mondo/qc-multiple-gene-associations.sparql
new file mode 100644
index 0000000000..1fa354d310
--- /dev/null
+++ b/src/sparql/qc/mondo/qc-multiple-gene-associations.sparql
@@ -0,0 +1,34 @@
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+
+# Get classes that have more than 1 gene association (either subClassOf or equivalentClass) with RO:0004003 property
+
+SELECT DISTINCT ?entity ?label (GROUP_CONCAT(DISTINCT ?geneIdentifier; separator=", ") AS ?geneIdentifiers)
+WHERE {
+  {
+    # subClassOf association
+    ?entity rdfs:subClassOf ?restriction ;
+            rdfs:label ?label .
+
+    ?restriction rdf:type owl:Restriction ;
+                 owl:onProperty obo:RO_0004003 ;
+                 owl:someValuesFrom ?geneIdentifier .
+  }
+  UNION
+  {
+    # equivalentClass association
+    ?entity owl:equivalentClass ?equivClass ;
+            rdfs:label ?label .
+
+    ?equivClass owl:intersectionOf/rdf:rest*/rdf:first ?component .
+
+    ?component rdf:type owl:Restriction ;
+               owl:onProperty obo:RO_0004003 ;
+               owl:someValuesFrom ?geneIdentifier .
+  }
+}
+GROUP BY ?entity ?label
+HAVING (COUNT(DISTINCT ?geneIdentifier) > 1)

From 17889aacdc9f3f30d2e7e80afd5f98e6b3fe400e Mon Sep 17 00:00:00 2001
From: Trish Whetzel
Date: Mon, 11 Nov 2024 17:55:23 -0800
Subject: [PATCH 5/5] add qc check for mismatched gene identifiers

---
 .../mondo/qc-gene-identifier-mismatch.sparql | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 src/sparql/qc/mondo/qc-gene-identifier-mismatch.sparql

diff --git a/src/sparql/qc/mondo/qc-gene-identifier-mismatch.sparql b/src/sparql/qc/mondo/qc-gene-identifier-mismatch.sparql
new file mode 100644
index 0000000000..6fa7433526
--- /dev/null
+++ b/src/sparql/qc/mondo/qc-gene-identifier-mismatch.sparql
@@ -0,0 +1,35 @@
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+
+# Find classes with mismatched gene identifiers added as equivalentTo and subClassOf
+
+SELECT DISTINCT ?entity ?label ?equivGeneIdentifier ?subClassGeneIdentifier
+WHERE {
+  ?entity rdf:type owl:Class ;
+          rdfs:label ?label .
+
+  # Equivalent class restriction
+  ?entity owl:equivalentClass ?equivClass .
+  ?equivClass owl:intersectionOf/rdf:rest*/rdf:first ?equivComponent .
+
+  ?equivComponent rdf:type owl:Restriction ;
+                  owl:onProperty obo:RO_0004003 ;
+                  owl:someValuesFrom ?equivGeneIdentifier .
+
+  # subClassOf restriction
+  ?entity rdfs:subClassOf ?subClassRestriction .
+  ?subClassRestriction rdf:type owl:Restriction ;
+                       owl:onProperty obo:RO_0004003 ;
+                       owl:someValuesFrom ?subClassGeneIdentifier .
+
+  # Filter for gene identifiers with HGNC or NCBIGene prefixes
+  FILTER(STRSTARTS(STR(?equivGeneIdentifier), "http://identifiers.org/hgnc/") ||
+         STRSTARTS(STR(?equivGeneIdentifier), "http://identifiers.org/ncbigene/"))
+  FILTER(STRSTARTS(STR(?subClassGeneIdentifier), "http://identifiers.org/hgnc/") ||
+         STRSTARTS(STR(?subClassGeneIdentifier), "http://identifiers.org/ncbigene/"))
+
+  # Filter for cases where the gene identifiers do not match
+  FILTER(?equivGeneIdentifier != ?subClassGeneIdentifier)
+}