Coverage for oc_validator / id_existence.py: 98%
89 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 15:46 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 15:46 +0000
1# Copyright (c) 2023, OpenCitations <contact@opencitations.net>
2#
3# Permission to use, copy, modify, and/or distribute this software for any purpose
4# with or without fee is hereby granted, provided that the above copyright notice
5# and this permission notice appear in all copies.
6#
7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
8# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
9# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
10# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
11# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
12# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
13# SOFTWARE.
15from oc_ds_converter.oc_idmanager import doi, isbn, issn, orcid, pmcid, pmid, ror, url, viaf, wikidata, wikipedia, \
16 openalex, crossref, jid, arxiv
17from sparqlite import SPARQLClient, SPARQLError
18import logging
# Logger registered under the 'oc_validator' name; IdExistence uses it to
# warn when a SPARQL query against the Meta triplestore fails.
logger = logging.getLogger('oc_validator')
class IdExistence:
    """
    Checks whether an external identifier actually exists by querying either
    the appropriate external service API, the OpenCitations Meta triplestore,
    or both.

    Identifier values come from the data being validated, so they are escaped
    or validated before being embedded in a SPARQL query. Identifiers that
    carry no ``prefix:`` part are reported as non-existing rather than raising
    ``ValueError``.
    """

    # SPARQL endpoint of the OpenCitations Meta triplestore.
    META_SPARQL_ENDPOINT = "https://sparql.opencitations.net/meta"

    # Prefixes of internal identifiers, which are never looked up anywhere.
    _INTERNAL_PREFIXES = ('temp:', 'local:')

    # Identifier scheme (OC prefix without the ":") -> name of the instance
    # attribute holding the manager for that scheme. Attribute names are
    # resolved lazily so only the manager actually needed is touched.
    _MANAGER_ATTRS = {
        'doi': 'doi_mngr',
        'isbn': 'isbn_mngr',
        'issn': 'issn_mngr',
        'orcid': 'orcid_mngr',
        'pmcid': 'pmcid_mngr',
        'pmid': 'pmid_mngr',
        'ror': 'ror_mngr',
        'url': 'url_mngr',
        'viaf': 'viaf_mngr',
        'wikidata': 'wikidata_mngr',
        'wikipedia': 'wikipedia_mngr',
        'openalex': 'openalex_mngr',
        'crossref': 'crossref_mngr',
        'jid': 'jid_mngr',
        'arxiv': 'arxiv_mngr',
    }

    def __init__(self, use_meta_endpoint: bool = True) -> None:
        """
        Initialise ID managers and the SPARQL endpoint for existence checks.

        :param use_meta_endpoint: Whether to query the OC Meta triplestore
            before falling back to external services. Defaults to ``True``.
        :type use_meta_endpoint: bool
        :rtype: None
        """
        self.doi_mngr = doi.DOIManager()
        self.isbn_mngr = isbn.ISBNManager()
        self.issn_mngr = issn.ISSNManager()
        self.orcid_mngr = orcid.ORCIDManager()
        self.pmcid_mngr = pmcid.PMCIDManager()
        self.pmid_mngr = pmid.PMIDManager()
        self.ror_mngr = ror.RORManager()
        self.url_mngr = url.URLManager()
        self.viaf_mngr = viaf.ViafManager()
        self.wikidata_mngr = wikidata.WikidataManager()
        self.wikipedia_mngr = wikipedia.WikipediaManager()
        self.openalex_mngr = openalex.OpenAlexManager()
        self.crossref_mngr = crossref.CrossrefManager()
        self.jid_mngr = jid.JIDManager()
        self.arxiv_mngr = arxiv.ArXivManager()
        self.use_meta_endpoint = use_meta_endpoint
        # The client is created even when use_meta_endpoint is False because
        # OMID lookups always go through the triplestore.
        self.sparql = SPARQLClient(self.META_SPARQL_ENDPOINT)

    def close(self) -> None:
        """Close the SPARQL client and release resources."""
        self.sparql.close()

    def _recreate_sparql_client(self) -> None:
        """Close and recreate the SPARQL client to release accumulated resources."""
        self.sparql.close()
        self.sparql = SPARQLClient(self.META_SPARQL_ENDPOINT)

    @staticmethod
    def _escape_sparql_literal(value: str) -> str:
        """
        Escape a string so it can be placed between double quotes in SPARQL.

        :param value: Raw text to embed in a ``"..."`` literal.
        :type value: str
        :return: The text with backslash, double quote, LF and CR escaped.
        :rtype: str
        """
        # The backslash must be handled first, otherwise the escapes added
        # for the other characters would themselves be doubled.
        return (
            value.replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r')
        )

    @staticmethod
    def _is_safe_scheme(scheme: str) -> bool:
        """
        Tell whether a scheme can be used as the local part of ``datacite:<scheme>``.

        Only non-empty ASCII names made of letters, digits, ``_`` and ``-``
        (not starting with ``-``) are accepted.

        :param scheme: The identifier prefix without the trailing ``:``.
        :type scheme: str
        :rtype: bool
        """
        if not scheme or not scheme.isascii() or scheme[0] == '-':
            return False
        return all(ch.isalnum() or ch in '_-' for ch in scheme)

    @staticmethod
    def _is_safe_iri_part(value: str) -> bool:
        """
        Tell whether a string can be embedded inside ``<...>`` in SPARQL.

        :param value: Text to append to an IRI.
        :type value: str
        :return: ``False`` if the text contains a control character, a space,
            or one of the characters that may not appear in an IRI reference.
        :rtype: bool
        """
        forbidden = '<>"{}|^`\\'
        return not any(ch <= ' ' or ch in forbidden for ch in value)

    def check_id_existence(self, id: str) -> bool:
        """
        Check whether an identifier exists in external services or Meta.

        If ``use_meta_endpoint`` is ``False``, only external services are queried.
        If ``True``, the OC Meta triplestore is queried first; if the ID is found
        there, ``True`` is returned immediately. Otherwise, the external service
        is queried as a fallback.

        :param id: The identifier string, including its prefix.
        :type id: str
        :return: ``True`` if the identifier is confirmed to exist, ``False`` otherwise.
        :rtype: bool
        """
        # temp: and local: internal IDs are always considered as existing
        if id.startswith(self._INTERNAL_PREFIXES):
            return True
        # OMID needs to be checked with a specific query on the triplestore
        if id.startswith('omid:'):
            return self.query_omid_in_meta(id)
        if self.use_meta_endpoint and self.query_meta_triplestore(id) is True:
            return True
        return self.query_external_service(id)

    def query_external_service(self, id: str) -> bool:
        """
        Check whether an identifier is registered in its native service.

        Dispatches to the appropriate manager's ``exists()`` method based on
        the identifier prefix.

        :param id: The identifier string, including its prefix.
        :type id: str
        :return: Whatever the manager's ``exists()`` returns for the value
            after the prefix; ``False`` if the identifier has no prefix or
            the prefix is not supported.
        :rtype: bool
        """
        scheme, separator, value = id.partition(':')
        if not separator:
            # No "prefix:" part, so there is no service to ask.
            return False
        manager_attr = self._MANAGER_ATTRS.get(scheme)
        if manager_attr is None:
            return False
        return getattr(self, manager_attr).exists(value)

    def query_meta_triplestore(self, id: str) -> bool:
        """
        Check whether an identifier exists in the OpenCitations Meta triplestore via SPARQL.

        Uses the ``sparqlite`` client's built-in retry with exponential backoff
        for transient failures. Returns ``False`` if the query fails after all
        retries are exhausted.

        :param id: The identifier string, including its prefix.
        :type id: str
        :return: ``True`` if the triplestore confirms the ID exists, ``False``
            otherwise (including when the identifier has no prefix or its
            prefix cannot be expressed as a ``datacite:`` scheme name).
        :rtype: bool
        """
        # The scheme is the OC prefix without the ":".
        datacite_id_scheme, separator, lookup_id = id.partition(':')
        if not separator or not self._is_safe_scheme(datacite_id_scheme):
            return False
        # Escape the value so quotes or backslashes in it cannot terminate
        # the literal and alter the query.
        literal_value = self._escape_sparql_literal(lookup_id)

        q = '''
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

            ASK {
                VALUES ?val { "%s" "%s"^^xsd:string }
                ?identifier literal:hasLiteralValue ?val .
                ?res datacite:hasIdentifier ?identifier .
                ?identifier datacite:usesIdentifierScheme datacite:%s .
            }
        ''' % (literal_value, literal_value, datacite_id_scheme)

        try:
            return self.sparql.ask(q)
        except SPARQLError as e:
            logger.warning("SPARQL query failed for '%s' after retries: %s", id, e)
            return False

    def query_omid_in_meta(self, id: str) -> bool:
        """
        Check whether an OMID is registered in the OpenCitations Meta triplestore.

        Uses a dedicated SPARQL query that checks for the OMID as both
        subject and object. Returns ``False`` if the query fails after all
        retries are exhausted.

        :param id: The OMID string, including the ``omid:`` prefix.
        :type id: str
        :return: ``True`` if the OMID exists in Meta, ``False`` otherwise
            (including when the value cannot be embedded in an IRI).
        :rtype: bool
        """
        lookup_id = id.replace('omid:', '', 1)
        if not self._is_safe_iri_part(lookup_id):
            # Characters such as ">" or whitespace would end the IRI early
            # and change the meaning of the query.
            return False

        q = '''
            ASK WHERE {
                { <https://w3id.org/oc/meta/%s> ?p ?o }
                UNION
                { ?s ?p <https://w3id.org/oc/meta/%s> }
            }
        ''' % (lookup_id, lookup_id)

        try:
            return self.sparql.ask(q)
        except SPARQLError as e:
            logger.warning("OMID SPARQL query failed for '%s' after retries: %s", id, e)
            return False