Coverage for oc_meta / lib / master_of_regex.py: 97%

34 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-21 09:24 +0000

1# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import re 

6 

# Separator between people in a multi-agent field: a ';' (with any surrounding
# whitespace) that is NOT inside a '[...]' block. The lookahead accepts the
# ';' only when the text that follows reaches a '[' or the end of the string
# without first meeting a ']'.
semicolon_in_people_field = r'\s*;\s*(?=[^\]]*(?:\[|$))'

# A single id token "schema:value". Square brackets are excluded from both
# sides of the colon so a stray '[' that leaks into the name (e.g.
# '[Labour Party[ [omid:ra/123]') cannot be absorbed into the captured id.
_ID_TOKEN = r'[^\s\[\]]+:[^\s\[\]]+'

# It captures a colon preceded and followed by zero or more spaces.
colon_and_spaces = r'\s*:\s*'

# It captures a comma preceded and followed by zero or more spaces.
comma_and_spaces = r'\s*,\s*'

# It captures one or more spaces.
one_or_more_spaces = r'\s+'

# Decomposes an entity URI into the named groups 'base', 'short_name' (one of
# br, ra, ar, re, id), 'supplier_prefix' ('06', any run of non-zero digits,
# then a closing '0') and 'entity_number' (a positive integer without leading
# zeros).
RE_ENTITY_URI = re.compile(r'^(?P<base>https://w3id\.org/oc/meta)/(?P<short_name>br|ra|ar|re|id)/(?P<supplier_prefix>06[1-9]*0)(?P<entity_number>[1-9]\d*)$')
RE_SEMICOLON_IN_PEOPLE_FIELD = re.compile(semicolon_in_people_field)

# Parses a responsible-agent / venue cell into a name and an optional list of
# ids inside square brackets. Always matches any input: when the '[ids]' block
# is present the engine captures it (preferring the first occurrence in a
# multi-RA string), otherwise the whole (trimmed) cell lands in 'name' and
# 'ids' is None. Downstream code can therefore rely on .groups() always
# succeeding; a regex failure indicates a real bug and should crash.
#
# re.DOTALL is required for the "always matches" guarantee: without it '.'
# stops at '\n', so a cell with an embedded line break followed by more text
# (e.g. 'Smith,\nJohn') could not be consumed by 'name' and .match() would
# return None. Inputs that already matched are captured exactly as before,
# because 'name' is lazy and still stops at the first point where the rest of
# the pattern succeeds.
RE_NAME_AND_IDS = re.compile(
    rf'''
    \s*
    (?P<name> .*? )                   # name, possibly empty, possibly with junk
    (?:
        \s* \[ \s*
        (?P<ids>
            (?: {_ID_TOKEN} )?        # optional first id ...
            (?: \s+ {_ID_TOKEN} )*    # ... followed by any number of space-separated ids
        )
        \s* \]
      | \s* \Z                        # or no '[ids]' block at all
    )
    ''',
    re.VERBOSE | re.DOTALL,
)

49 

50 

def split_name_and_ids(text: str) -> tuple[str, str]:
    """Split a responsible-agent / venue cell into its name and its ids.

    Returns a ``(name, ids_str)`` pair: ``name`` is the ``name`` group of
    ``RE_NAME_AND_IDS`` and ``ids_str`` is the ``ids`` group, replaced by
    an empty string when that group did not take part in the match.

    Raises:
        RuntimeError: if ``RE_NAME_AND_IDS`` does not match ``text``. The
            pattern is meant to accept every input, so a miss is treated
            as a regression in the pattern and is reported loudly instead
            of being papered over with a fallback value.
    """
    parsed = RE_NAME_AND_IDS.match(text)
    if parsed is not None:
        ids_block = parsed.group("ids")
        # A missing '[ids]' block yields None; callers always get a string.
        return parsed.group("name"), ids_block if ids_block else ""
    raise RuntimeError(f"RE_NAME_AND_IDS failed to match {text!r}")

62 

63 

# Compiled counterparts of the separator pattern strings, built once at
# import time.
RE_COLON_AND_SPACES = re.compile(colon_and_spaces)
RE_COMMA_AND_SPACES = re.compile(comma_and_spaces)
RE_ONE_OR_MORE_SPACES = re.compile(one_or_more_spaces)
# One or more whitespace characters, spelled out inline here.
RE_MULTIPLE_SPACES = re.compile(r'\s+')

68 

# It captures any pages range separator: a run of non-alphanumeric characters
# that is followed by at least one alphanumeric character.
pages_separator = r'[^A-Za-z\d]+(?=[A-Za-z\d]+)'

# It captures an ORCID: four dash-separated groups of four characters, the
# last character being a digit or 'X'.
orcid_pattern = r'([0-9]{4}-){3}[0-9]{3}[0-9X]'

# A series of patterns useful to clean invalid "volume" and "issue" fields.
# The fragments below are meant to be interpolated into larger expressions:
# 'separators' and 'alphabets' go inside a character class, while
# 'vi_pattern' and 'year_pattern' each contribute one capturing group.
good_sep = r'\-'
bad_sep = r'&\/_,\.:+;\(\[\|'
separators = good_sep + bad_sep
alphabets = r'a-zà-öø-ÿ\u0430-я\u0391-ω'  # basic latin, latin-1 supplement, cyrillic, greek
# A volume/issue value: either a token containing at least one digit or a
# roman numeral, optionally followed by further separator-joined tokens.
vi_pattern = fr'((?:[{alphabets}]*\d[{alphabets}\d]*|[ivxlcdm]+)(?:\s?(?:[{separators}]|and|\()\s?[{alphabets}\d]+\)?)*?)'
# Spellings of "number" (№, n., no., nº, num, numéro, number, ...).
numero = r'(?:№|no?(?!v)\.?|n[º°]\.?|n[uú]m(?:[eé]ro)?|number)'
year_pattern = r'(\d{4})'

# Shapes accepted for both volumes and issues.
valid_vi_patterns = [
    vi_pattern,
    fr'[‹<\()]?[{alphabets}]+?([{separators}\s]?[{alphabets}])*[\)›>]?',
    fr'[{alphabets}{separators}\s]+{vi_pattern}',
    fr"[{alphabets}\d\-'/]*\d[{alphabets}\d\-'/]*(,?\s[{alphabets}\d\-'/]+)+",
    fr'\(?s(uppl([eéi]ment(ary|um)?))?\)?\s?(part)?\s?(s?{vi_pattern})?',
    fr'({vi_pattern}_)?({vi_pattern}\s)?[\(_]?supp?(plement|pl)?l?[\s\._]*({vi_pattern}|[{alphabets}])?\)?\.?',
    fr'{vi_pattern}*,?\s?part[\s_]{vi_pattern}(\sof\s{vi_pattern})?(,\sno?\.\s?{vi_pattern})?',
    fr'{vi_pattern}*[_\s]?pt?[_\s\.]{vi_pattern}',
    r'(ed|pt|d)\sinside(d|r)',
    r'p(ublish\s)?a(head\sof\s)?p(rint)?',
    '預刊文章',
    '[\u0621-\u064A]+',
    fr'\[{year_pattern}\]\s(\d\s)?[{alphabets}]+',
    fr'{vi_pattern}\s\[\+CDROM\]',
    fr'{vi_pattern}[{separators}\s]?\({vi_pattern}\)(\s{vi_pattern})?',
    fr'([{alphabets}]+\.)?[{alphabets}]+\.?',
    fr'[{alphabets}]+-\d+',
    fr'[{alphabets}]+(_[{alphabets}]+)+',
    fr'{numero}:?\s?{vi_pattern}(,?\s({year_pattern}|\({vi_pattern}\)))?',
    r'historica\svol\.\s\d+(,\d+(-\d+)?)?',
    r'\d+\(\d+\)\d{2,4}',
    fr'(\[{year_pattern}\]\s)?(\d+\s)?vl?r(\s\([a-z]+\))?',
    fr'\({vi_pattern}\/{vi_pattern}\)\s[{alphabets}]+(-[{alphabets}]+)?'
]
# Shapes accepted for volumes only.
volumes_valid_patterns = [
    r'original\sseries,\svolume\s\d+',
    fr'(vol(ume)?|tome|cilt)\s?[{separators}]?\s?{vi_pattern}'
]
# Shapes accepted for issues only.
issues_valid_patterns = [
    fr'issue[\.,]?\s{vi_pattern}',
    fr'({vi_pattern}\s)?e?sp?e?(ecial)?[\s_\-\.](issue)?(_number_)?[\s_-]?({vi_pattern})?(["“][{alphabets}\s]+?["”])?',
    fr'ö(zel)?(\ss(ayı)?|\(special\))?(\s?{vi_pattern})?',
    fr'({numero}[{separators}\s]?)?hors[{separators}\s]série[{separators}\s]{vi_pattern}',
    '특별호',
    fr'([{alphabets}]+\s{year_pattern}\s)?\(?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|(nov|dec)(ember)?|spring|summer|autumn|winter)(\s{year_pattern})?\)?',
    fr'{vi_pattern},\spart\s{vi_pattern}\sof\s{vi_pattern}',
    fr'sayı[{separators}\s]\s?{vi_pattern}',
    fr'issues?\s{vi_pattern},\s(supplement|part)\s{vi_pattern}',
    fr'issues?\s{vi_pattern}\.?\spp\.\s[a-z\d]+[^a-z\d]+[a-z\d]+'
]
# Known malformed shapes, each mapped to a strategy label. The labels are
# interpreted by code outside this section.
# NOTE(review): the last entry's label 's)' looks like the text needed to
# complete a truncated '(first serie' -- confirm against the consumer.
invalid_vi_patterns = {
    fr'.*?(?:vol\.?(?:ume)?|tome)(?:[{separators}]?\s?){vi_pattern}[\-&\/_,\.:+;\(\)\[\]|\s]*(?:{numero}|issues?)[{separators}|\s]*(?:sp[eé]cial\s)?{vi_pattern}': 'vol_iss',
    fr'{vi_pattern},\s?{numero}\s?{vi_pattern}': 'vol_iss',
    fr'tập\s?{vi_pattern},?\s?số\s?{vi_pattern}': 'vol_iss',
    fr'issues?\s{vi_pattern}\svol\.?(?:ume)?\s{vi_pattern}(?:.*?{year_pattern}.*?)?': 'iss_vol_year',
    fr"{vi_pattern}\s?\({vi_pattern}'{year_pattern}\)": 'vol_iss_year',
    fr'cilt[{separators}\s]\s?{vi_pattern}[{separators}\s]sayı[{separators}\s]\s?{vi_pattern}(?:[{separators}\s]\s?temmuz\s{year_pattern})?': 'vol_iss_year',
    r'&na;|n\/a|not\savailable': 'del',
    r'[\:\-\.`ё/]': 'del',
    fr'\${{[{alphabets}]+(\.[{alphabets}]+)?}}': 'del',
    fr"[&\/_,:+;\|`'#]\s*{vi_pattern}": 'all',
    fr'[\->+]{vi_pattern}': 'do_nothing',
    fr"{vi_pattern}[\.+]": "do_nothing",
    fr"{numero}?[{separators}]?\s?{vi_pattern}[&\/_,:;\|`'\(\[\{{]": 'all',
    fr'{vi_pattern}\(\)': 'all',
    fr'n[�?]+{vi_pattern}': 'all',
    fr'{vi_pattern}(?:â\x80[\x92\x93\x94]|�+|â|\?+){vi_pattern}': 'sep',
    fr'{vi_pattern}\s?\(first\sserie': 's)'
}

# Compiled, case-insensitive, whole-string versions of the tables above.
#
# Each pattern is wrapped in a non-capturing group before the anchors are
# added. '|' binds more loosely than '^' and '$', so without the group a
# pattern with top-level alternatives such as '&na;|n\/a|not\savailable'
# would expand to '^&na;' | 'n\/a' | 'not\savailable$', letting 'n/a' match
# anywhere inside a longer value. The group is non-capturing, so the numbering
# of the capturing groups contributed by vi_pattern / year_pattern is
# unchanged.
RE_INVALID_VI_PATTERNS = {
    re.compile(f'^(?:{pattern})$', re.IGNORECASE): strategy
    for pattern, strategy in invalid_vi_patterns.items()
}
RE_VOLUMES_VALID_PATTERNS = [
    re.compile(f'^(?:{pattern})$', re.IGNORECASE)
    for pattern in volumes_valid_patterns
]
RE_ISSUES_VALID_PATTERNS = [
    re.compile(f'^(?:{pattern})$', re.IGNORECASE)
    for pattern in issues_valid_patterns
]

155]