Coverage for oc_validator / id_existence.py: 98%

89 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 15:46 +0000

1# Copyright (c) 2023, OpenCitations <contact@opencitations.net> 

2# 

3# Permission to use, copy, modify, and/or distribute this software for any purpose 

4# with or without fee is hereby granted, provided that the above copyright notice 

5# and this permission notice appear in all copies. 

6# 

7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

8# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

9# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 

10# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 

11# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 

12# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 

13# SOFTWARE. 

14 

15from oc_ds_converter.oc_idmanager import doi, isbn, issn, orcid, pmcid, pmid, ror, url, viaf, wikidata, wikipedia, \ 

16 openalex, crossref, jid, arxiv 

17from sparqlite import SPARQLClient, SPARQLError 

18import logging 

19 

# Every oc_validator module logs through the shared package-level logger, so
# a single handler/level configuration on 'oc_validator' covers them all.
logger = logging.getLogger("oc_validator")

21 

22 

class IdExistence:
    """
    Checks whether an external identifier actually exists by querying either
    the appropriate external service API, the OpenCitations Meta triplestore,
    or both.

    Instances own a SPARQL client; call :meth:`close` when done, or use the
    instance as a context manager (``with IdExistence() as checker: ...``).
    """

    # SPARQL endpoint of the OpenCitations Meta triplestore. Kept in one place
    # so that __init__ and _recreate_sparql_client cannot drift apart.
    META_ENDPOINT = "https://sparql.opencitations.net/meta"

    # Prefixes of internal identifiers, which are never looked up anywhere.
    _INTERNAL_PREFIXES = ('temp:', 'local:')

    # Maps each supported OC prefix to the name of the instance attribute that
    # holds its ID manager. Attribute names (rather than manager objects) are
    # stored so that a manager replaced on the instance is still picked up.
    _MANAGER_ATTRS = {
        'doi:': 'doi_mngr',
        'isbn:': 'isbn_mngr',
        'issn:': 'issn_mngr',
        'orcid:': 'orcid_mngr',
        'pmcid:': 'pmcid_mngr',
        'pmid:': 'pmid_mngr',
        'ror:': 'ror_mngr',
        'url:': 'url_mngr',
        'viaf:': 'viaf_mngr',
        'wikidata:': 'wikidata_mngr',
        'wikipedia:': 'wikipedia_mngr',
        'openalex:': 'openalex_mngr',
        'crossref:': 'crossref_mngr',
        'jid:': 'jid_mngr',
        'arxiv:': 'arxiv_mngr',
    }

    # Characters that must be backslash-escaped inside a double-quoted SPARQL
    # string literal (a raw quote, backslash or line break ends or breaks it).
    _SPARQL_LITERAL_ESCAPES = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
    })

    # Characters that may not appear between '<' and '>' in a SPARQL IRI
    # (in addition to every character up to and including the space).
    _IRI_FORBIDDEN_CHARS = frozenset('<>"{}|^`\\')

    def __init__(self, use_meta_endpoint: bool = True) -> None:
        """
        Initialise ID managers and the SPARQL endpoint for existence checks.

        :param use_meta_endpoint: Whether to query the OC Meta triplestore
            before falling back to external services. Defaults to ``True``.
        :type use_meta_endpoint: bool
        :rtype: None
        """
        self.doi_mngr = doi.DOIManager()
        self.isbn_mngr = isbn.ISBNManager()
        self.issn_mngr = issn.ISSNManager()
        self.orcid_mngr = orcid.ORCIDManager()
        self.pmcid_mngr = pmcid.PMCIDManager()
        self.pmid_mngr = pmid.PMIDManager()
        self.ror_mngr = ror.RORManager()
        self.url_mngr = url.URLManager()
        self.viaf_mngr = viaf.ViafManager()
        self.wikidata_mngr = wikidata.WikidataManager()
        self.wikipedia_mngr = wikipedia.WikipediaManager()
        self.openalex_mngr = openalex.OpenAlexManager()
        self.crossref_mngr = crossref.CrossrefManager()
        self.jid_mngr = jid.JIDManager()
        self.arxiv_mngr = arxiv.ArXivManager()
        self.use_meta_endpoint = use_meta_endpoint
        # The client is created even when use_meta_endpoint is False because
        # OMIDs are always resolved against the triplestore.
        self.sparql = SPARQLClient(self.META_ENDPOINT)

    def __enter__(self) -> "IdExistence":
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the SPARQL client on leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the SPARQL client and release resources."""
        self.sparql.close()

    def _recreate_sparql_client(self) -> None:
        """Close and recreate the SPARQL client to release accumulated resources."""
        self.sparql.close()
        self.sparql = SPARQLClient(self.META_ENDPOINT)

    def check_id_existence(self, id: str) -> bool:
        """
        Check whether an identifier exists in external services or Meta.

        ``temp:`` and ``local:`` identifiers are internal and always count as
        existing. ``omid:`` identifiers are always checked against the OC Meta
        triplestore, regardless of ``use_meta_endpoint``.

        For every other identifier: if ``use_meta_endpoint`` is ``False``,
        only external services are queried. If ``True``, the OC Meta
        triplestore is queried first; if the ID is found there, ``True`` is
        returned immediately. Otherwise, the external service is queried as a
        fallback.

        :param id: The identifier string, including its prefix.
        :type id: str
        :return: ``True`` if the identifier is confirmed to exist, ``False`` otherwise.
        :rtype: bool
        """
        # temp: and local: internal IDs are always considered as existing
        if id.startswith(self._INTERNAL_PREFIXES):
            return True
        # OMID needs to be checked with a specific query on the triplestore
        if id.startswith('omid:'):
            return self.query_omid_in_meta(id)
        if self.use_meta_endpoint and self.query_meta_triplestore(id) is True:
            return True
        return self.query_external_service(id)

    def query_external_service(self, id: str) -> bool:
        """
        Check whether an identifier is registered in its native service.

        Dispatches to the appropriate manager's ``exists()`` method based on
        the identifier prefix. Identifiers without a ``prefix:`` part, or
        with a prefix no manager is registered for, yield ``False``.

        :param id: The identifier string, including its prefix.
        :type id: str
        :return: ``True`` if the identifier exists in the external service, ``False`` otherwise.
        :rtype: bool
        """
        scheme, colon, value = id.partition(':')
        if not colon:
            # No prefix at all: there is no service to ask.
            return False
        manager_attr = self._MANAGER_ATTRS.get(scheme + colon)
        if manager_attr is None:
            return False
        return getattr(self, manager_attr).exists(value)

    def query_meta_triplestore(self, id: str) -> bool:
        """
        Check whether an identifier exists in the OpenCitations Meta triplestore via SPARQL.

        Relies on the ``sparqlite`` client's built-in retry with exponential
        backoff for transient failures. Returns ``False`` if the query fails
        after all retries are exhausted, and also (without sending a query)
        if the identifier has no ``prefix:`` part or its prefix cannot be
        written as a ``datacite:`` scheme name.

        :param id: The identifier string, including its prefix.
        :type id: str
        :return: ``True`` if the triplestore confirms the ID exists, ``False`` otherwise.
        :rtype: bool
        """
        # The DataCite scheme name is the OC prefix without the trailing ":".
        datacite_id_scheme, colon, lookup_id = id.partition(':')
        if not colon or not self._is_safe_scheme(datacite_id_scheme):
            return False

        # The value ends up inside a quoted literal, so it must be escaped to
        # keep the query well-formed (and to prevent query injection).
        literal = self._escape_sparql_literal(lookup_id)

        q = '''
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

            ASK {
                VALUES ?val { "%s" "%s"^^xsd:string }
                ?identifier literal:hasLiteralValue ?val .
                ?res datacite:hasIdentifier ?identifier .
                ?identifier datacite:usesIdentifierScheme datacite:%s .
            }
        ''' % (literal, literal, datacite_id_scheme)

        return self._ask(q, id, "SPARQL query failed for '%s' after retries: %s")

    def query_omid_in_meta(self, id: str) -> bool:
        """
        Check whether an OMID is registered in the OpenCitations Meta triplestore.

        Uses a dedicated SPARQL query that checks for the OMID as both
        subject and object. Returns ``False`` if the query fails after all
        retries are exhausted, and also (without sending a query) if the OMID
        value is empty or contains characters that cannot appear in an IRI.

        :param id: The OMID string, including the ``omid:`` prefix.
        :type id: str
        :return: ``True`` if the OMID exists in Meta, ``False`` otherwise.
        :rtype: bool
        """
        lookup_id = id.replace('omid:', '', 1)
        if not self._is_safe_iri_suffix(lookup_id):
            return False

        q = '''
            ASK WHERE {
                { <https://w3id.org/oc/meta/%s> ?p ?o }
                UNION
                { ?s ?p <https://w3id.org/oc/meta/%s> }
            }
        ''' % (lookup_id, lookup_id)

        return self._ask(q, id, "OMID SPARQL query failed for '%s' after retries: %s")

    def _ask(self, query: str, id: str, failure_message: str) -> bool:
        """
        Run an ASK query, logging a warning and returning ``False`` on ``SPARQLError``.

        :param query: The complete SPARQL ASK query.
        :param id: The identifier being checked (only used in the log message).
        :param failure_message: ``logging``-style format string taking the
            identifier and the error as its two arguments.
        :return: The client's answer, or ``False`` if the query failed.
        """
        try:
            return self.sparql.ask(query)
        except SPARQLError as e:
            logger.warning(failure_message, id, e)
            return False

    @classmethod
    def _escape_sparql_literal(cls, value: str) -> str:
        """Backslash-escape ``value`` for use inside a double-quoted SPARQL literal."""
        return value.translate(cls._SPARQL_LITERAL_ESCAPES)

    @staticmethod
    def _is_safe_scheme(scheme: str) -> bool:
        """
        Tell whether ``scheme`` can be appended verbatim after ``datacite:``.

        Accepts non-empty ASCII names made of letters, digits, ``_`` and ``-``
        that start with a letter or digit.
        """
        if not scheme or not scheme.isascii() or not scheme[0].isalnum():
            return False
        return all(ch.isalnum() or ch in '_-' for ch in scheme)

    @classmethod
    def _is_safe_iri_suffix(cls, suffix: str) -> bool:
        """Tell whether ``suffix`` is non-empty and free of characters illegal in a SPARQL IRI."""
        if not suffix:
            return False
        return not any(ch <= ' ' or ch in cls._IRI_FORBIDDEN_CHARS for ch in suffix)