Coverage for oc_ds_converter / oc_idmanager / wikipedia.py: 61%
88 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023-2024 Arcangelo Massari <arcangelo.massari@unibo.it>
2#
3# SPDX-License-Identifier: ISC
6from json import loads
7from re import match, sub
8from time import sleep
9from urllib.parse import unquote
11from oc_ds_converter.oc_idmanager.base import IdentifierManager
12from requests import ReadTimeout, get
13from requests.exceptions import ConnectionError
class WikipediaManager(IdentifierManager):
    """Identifier manager for Wikipedia (MediaWiki) page IDs.

    Identifiers have the form ``wikipedia:<pageid>``, where ``<pageid>`` is a
    positive integer without leading zeros. Validation results are cached in
    ``self._data``, keyed by the prefixed, normalised identifier.
    """

    def __init__(self, data=None, use_api_service=True):
        """Wikipedia manager constructor.

        Args:
            data: optional mapping used as the validation cache. It is stored
                by reference, so a caller-supplied mapping is updated in
                place. When omitted, each instance gets its own empty dict
                (a ``{}`` default would be shared by every instance).
            use_api_service: when False, ``exists`` never contacts the
                MediaWiki API and reports every identifier as existing.
        """
        super().__init__()
        self._api = "https://en.wikipedia.org/w/api.php/"
        self._use_api_service = use_api_service
        self._p = "wikipedia:"
        self._data = {} if data is None else data

    def is_valid(self, wikipedia_id, get_extra_info=False):
        """Tell whether ``wikipedia_id`` is well formed and exists.

        Args:
            wikipedia_id: identifier, with or without the ``wikipedia:`` prefix.
            get_extra_info: when True, return ``(valid, info_dict)`` instead
                of the bare boolean.

        Returns:
            ``bool``, or ``(bool, dict)`` when ``get_extra_info`` is True.
        """
        wikipedia_id = self.normalise(wikipedia_id, include_prefix=True)
        if wikipedia_id is None:
            # NOTE(review): a bare False is returned even when get_extra_info
            # is True; kept as-is because callers may rely on its truthiness.
            return False

        if wikipedia_id not in self._data or self._data[wikipedia_id] is None:
            info = {}
            valid = False
            # Check the syntax first so malformed IDs never trigger a request.
            if self.syntax_ok(wikipedia_id):
                if get_extra_info:
                    valid, info = self.exists(wikipedia_id, get_extra_info=True)
                else:
                    valid = self.exists(wikipedia_id)
            # Cache the combined verdict (syntax AND existence) so later
            # cached lookups agree with the value returned here.
            info["valid"] = bool(valid)
            self._data[wikipedia_id] = info
            if get_extra_info:
                return info["valid"], info
            return info["valid"]

        cached = self._data[wikipedia_id]
        if get_extra_info:
            return cached.get("valid"), cached
        return cached.get("valid")

    def normalise(self, id_string, include_prefix=False):
        """Reduce ``id_string`` to its ASCII digits, optionally prefixed.

        The ``wikipedia:`` prefix is removed if present, the remainder is
        percent-decoded and every character other than ``0-9`` is dropped.

        Returns:
            The digit string (possibly empty), preceded by ``wikipedia:``
            when ``include_prefix`` is True, or None if ``id_string`` cannot
            be processed (e.g. it is not a string).
        """
        try:
            if id_string.startswith(self._p):
                wikipedia_string = id_string[len(self._p):]
            else:
                wikipedia_string = id_string
            digits = sub(r"[^0-9]", "", unquote(wikipedia_string))
        except Exception:
            # Any error in processing the MediaWiki pageID will return None
            return None
        return self._p + digits if include_prefix else digits

    def syntax_ok(self, id_string):
        """Return True if ``id_string`` is ``wikipedia:`` plus a positive integer.

        The prefix is added when missing; leading zeros are rejected.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match(r"^wikipedia:[1-9][0-9]*$", id_string))

    def exists(self, wikipedia_id_full, get_extra_info=False, allow_extra_api=None):
        """Check through the MediaWiki API that the page ID exists.

        A page is considered existing when the API response for its page ID
        carries a ``title``. The request is attempted up to three times;
        read timeouts are retried immediately, connection errors after a
        five second pause.

        Args:
            wikipedia_id_full: identifier, with or without prefix.
            get_extra_info: when True, return ``(found, {"valid": found})``.
            allow_extra_api: unused here.

        Returns:
            True when the API service is disabled or the page is found;
            False when the ID cannot be normalised, the page is missing, the
            API answers with a 4xx status, or all attempts fail.
        """

        def answer(found):
            return (found, {"valid": found}) if get_extra_info else found

        if not self._use_api_service:
            return answer(True)

        wikipedia_id = self.normalise(wikipedia_id_full)
        if wikipedia_id is None:
            return answer(False)

        query_params = {
            "action": "query",
            "pageids": wikipedia_id,
            "format": "json",
            # With formatversion 1 the pages are keyed by page ID string.
            "formatversion": "1",
        }
        # NOTE(review): self._headers is not set in this class; presumably
        # provided by IdentifierManager.__init__ - confirm.
        for _ in range(3):
            try:
                r = get(self._api, params=query_params, headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Sleep 5 seconds, then try again
                sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                json_res = loads(r.text)
                try:
                    found = "title" in json_res["query"]["pages"][wikipedia_id]
                except KeyError:
                    found = False
                return answer(found)
            if 400 <= r.status_code < 500:
                return answer(False)
            # Any other status (e.g. 5xx): retry.

        # Every attempt failed.
        return answer(False)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Return the extra information for an API response.

        Not implemented yet: all arguments are ignored and ``{"valid": True}``
        is always returned.
        """
        result = {}
        result["valid"] = True
        # to be implemented
        return result