Coverage for oc_ds_converter / oc_idmanager / wikipedia.py: 61%

88 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023-2024 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5 

6from json import loads 

7from re import match, sub 

8from time import sleep 

9from urllib.parse import unquote 

10 

11from oc_ds_converter.oc_idmanager.base import IdentifierManager 

12from requests import ReadTimeout, get 

13from requests.exceptions import ConnectionError 

14 

15 

class WikipediaManager(IdentifierManager):
    """Identifier manager for Wikipedia page identifiers.

    Identifiers have the form ``wikipedia:<pageid>``, where ``<pageid>`` is a
    positive integer without leading zeros. Existence is checked against the
    English Wikipedia Action API (``self._api``).
    """

    def __init__(self, data=None, use_api_service=True):
        """Wikipedia manager constructor.

        Args:
            data: Optional mapping used as validation cache, keyed by
                prefixed identifier (``wikipedia:<pageid>``) with dict values
                holding at least a ``"valid"`` entry. When omitted, a fresh
                dict is created for this instance (a shared mutable default
                would leak cached results between unrelated managers).
            use_api_service: When False, ``exists`` never contacts the API
                and reports every identifier as existing.
        """
        super().__init__()
        self._api = "https://en.wikipedia.org/w/api.php/"
        self._use_api_service = use_api_service
        self._p = "wikipedia:"
        self._data = {} if data is None else data

    def is_valid(self, wikipedia_id, get_extra_info=False):
        """Tell whether ``wikipedia_id`` is syntactically valid and exists.

        Results are cached in ``self._data`` under the normalised, prefixed
        identifier; a cached entry is returned without any new check.

        Args:
            wikipedia_id: Identifier, with or without the ``wikipedia:`` prefix.
            get_extra_info: When True, return ``(valid, info_dict)`` instead
                of the bare validity flag.

        Returns:
            ``valid`` or, if ``get_extra_info`` is True, ``(valid, info)``
            where ``info`` is the dict stored in the cache.
        """
        wikipedia_id = self.normalise(wikipedia_id, include_prefix=True)

        if wikipedia_id is None:
            # Keep the return shape consistent with `exists`, so callers that
            # unpack a tuple do not fail on unnormalisable input.
            if get_extra_info:
                return False, {"valid": False}
            return False

        if wikipedia_id not in self._data or self._data[wikipedia_id] is None:
            if not self.syntax_ok(wikipedia_id):
                # Malformed ids are invalid regardless of the API answer:
                # skip the request and never cache them as valid.
                valid, info = False, {"valid": False}
            elif get_extra_info:
                valid, info = self.exists(wikipedia_id, get_extra_info=True)
            else:
                valid = bool(self.exists(wikipedia_id))
                info = {"valid": valid}
            self._data[wikipedia_id] = info
        else:
            info = self._data[wikipedia_id]
            valid = info.get("valid")

        if get_extra_info:
            return valid, info
        return valid

    def normalise(self, id_string, include_prefix=False):
        """Reduce ``id_string`` to its ASCII digits, optionally re-prefixed.

        The ``wikipedia:`` prefix (if present) is removed, the remainder is
        percent-decoded and every character other than ``0-9`` is dropped.
        Note that an input without digits yields an empty identifier, not
        ``None``; ``syntax_ok`` rejects it.

        Args:
            id_string: Identifier to normalise.
            include_prefix: When True, prepend ``wikipedia:`` to the result.

        Returns:
            The normalised string, or ``None`` if ``id_string`` cannot be
            processed (for example, it is not a string).
        """
        try:
            if id_string.startswith(self._p):
                wikipedia_string = id_string[len(self._p):]
            else:
                wikipedia_string = id_string

            # Only ASCII digits survive this substitution, so no further
            # NUL/whitespace clean-up is needed afterwards.
            wikipedia_string = sub(r"[^0-9]", "", unquote(wikipedia_string))
            return "%s%s" % (
                self._p if include_prefix else "",
                wikipedia_string,
            )
        except Exception:
            # Any error in processing the MediaWiki pageID will return None
            # (KeyboardInterrupt/SystemExit are deliberately not swallowed).
            return None

    def syntax_ok(self, id_string):
        """Return True if ``id_string`` is ``wikipedia:`` plus a positive integer.

        The prefix is added when missing. Leading zeros, an empty number and
        any trailing character (including a newline) are rejected.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        # `\Z` instead of `$`: `$` would also accept a trailing newline.
        return bool(match(r"^wikipedia:[1-9][0-9]*\Z", id_string))

    def exists(self, wikipedia_id_full, get_extra_info=False, allow_extra_api=None):
        """Check through the Wikipedia API whether the page id exists.

        Args:
            wikipedia_id_full: Identifier, with or without prefix.
            get_extra_info: When True, return ``(found, {"valid": found})``
                instead of the bare flag.
            allow_extra_api: Unused; kept for interface compatibility.

        Returns:
            True if the API reports a page with a title for this id, or if
            the API service is disabled; False if the id cannot be
            normalised, the API answers with a 4xx status, the page is
            unknown, or all attempts fail.
        """
        found = self._page_exists(wikipedia_id_full)
        if get_extra_info:
            return found, {"valid": found}
        return found

    def _page_exists(self, wikipedia_id_full):
        """Return the bare existence flag computed by ``exists``."""
        if not self._use_api_service:
            # Without the API there is nothing to check against.
            return True

        wikipedia_id = self.normalise(wikipedia_id_full)
        if wikipedia_id is None:
            return False

        query_params = {
            "action": "query",
            "pageids": wikipedia_id,
            "format": "json",
            "formatversion": "1",  # format of json output (current version 1; might be replaced w/ v.2)
        }

        # Up to three attempts; only transient failures are retried.
        for _ in range(3):
            try:
                # NOTE(review): self._headers is not set in this class;
                # presumably provided by IdentifierManager - confirm.
                # TODO: check (translated from the original "controlla" note).
                r = get(self._api, params=query_params, headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Sleep 5 seconds, then try again
                sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                try:
                    json_res = loads(r.text)
                except ValueError:
                    # Body is not valid JSON: treat it as a transient failure.
                    continue
                try:
                    # The lookup relies on `pages` being keyed by the page id
                    # string; a page entry without "title" counts as missing.
                    return "title" in json_res["query"]["pages"][wikipedia_id]
                except (KeyError, TypeError):
                    return False
            if 400 <= r.status_code < 500:
                return False
            # Any other status: fall through and retry.

        return False

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-information dict for an API response.

        Not implemented yet: all arguments are ignored and the result is
        always ``{"valid": True}``.
        """
        # to be implemented
        return {"valid": True}