Coverage for oc_meta / lib / master_of_regex.py: 97%
34 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-21 09:24 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-21 09:24 +0000
# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
#
# SPDX-License-Identifier: ISC
import re

# Split by ';' outside '[]' (any spaces before and after ';'). The lookahead
# only accepts a ';' when the text after it reaches a '[' or the end of the
# string without first crossing a ']', i.e. when the ';' is not inside an
# '[ids]' block.
semicolon_in_people_field = r'\s*;\s*(?=[^\]]*(?:\[|$))'

# A single id token "schema:value". Square brackets are excluded from both
# sides of the colon so a stray '[' that leaks into the name (e.g.
# '[Labour Party[ [omid:ra/123]') cannot be absorbed into the captured id.
_ID_TOKEN = r'[^\s\[\]]+:[^\s\[\]]+'

# It captures a colon preceded and followed by zero or more spaces.
colon_and_spaces = r'\s*:\s*'

# It captures a comma preceded and followed by zero or more spaces.
comma_and_spaces = r'\s*,\s*'

# It captures one or more spaces.
one_or_more_spaces = r'\s+'

# Full entity URI split into its named parts: base, short_name (one of
# br/ra/ar/re/id), supplier_prefix ('06', optional non-zero digits, then '0')
# and entity_number (no leading zero).
RE_ENTITY_URI = re.compile(
    r'^(?P<base>https://w3id\.org/oc/meta)/(?P<short_name>br|ra|ar|re|id)/(?P<supplier_prefix>06[1-9]*0)(?P<entity_number>[1-9]\d*)$'
)
RE_SEMICOLON_IN_PEOPLE_FIELD = re.compile(semicolon_in_people_field)

# Parses a responsible-agent / venue cell into a name and an optional list of
# ids inside square brackets. Always matches any input: when the '[ids]' block
# is present the engine captures it (preferring the first occurrence in a
# multi-RA string), otherwise the whole (trimmed) cell lands in 'name' and
# 'ids' is None. Downstream code can therefore rely on .groups() always
# succeeding; a regex failure indicates a real bug and should crash.
#
# re.DOTALL is required for the "always matches" guarantee: without it '.'
# stops at a line break, so a cell with an embedded newline and no '[ids]'
# block before it (e.g. 'foo\nbar') could not match at all. Inputs without
# embedded newlines match exactly as they would without the flag.
RE_NAME_AND_IDS = re.compile(
    rf'''
    \s*
    (?P<name> .*? )                 # name, possibly empty, possibly with junk
    (?:
        \s* \[ \s*
        (?P<ids>
            (?: {_ID_TOKEN} )?      # optional first id ...
            (?: \s+ {_ID_TOKEN} )*  # ... followed by any number of space-separated ids
        )
        \s* \]
      | \s* \Z                      # or no '[ids]' block at all
    )
    ''',
    re.VERBOSE | re.DOTALL,
)
def split_name_and_ids(text: str) -> tuple[str, str]:
    """Parse a responsible-agent / venue cell into ``(name, ids_str)``.

    ``RE_NAME_AND_IDS`` is meant to match any input, so a ``None`` here
    signals a regression in the pattern itself and must be raised loudly
    rather than silently fallen back to.

    Args:
        text: The raw cell content.

    Returns:
        A ``(name, ids_str)`` tuple built from the ``name`` and ``ids``
        groups of ``RE_NAME_AND_IDS``. ``ids_str`` is ``""`` when the
        ``ids`` group is ``None`` or empty.

    Raises:
        RuntimeError: If ``RE_NAME_AND_IDS`` does not match ``text``.
    """
    match = RE_NAME_AND_IDS.match(text)
    if match is None:
        raise RuntimeError(f"RE_NAME_AND_IDS failed to match {text!r}")
    # Normalise a missing '[ids]' block (group is None) to an empty string so
    # the return type is always (str, str).
    return match["name"], match["ids"] or ""
# Pre-compiled forms of the simple separator patterns defined in this module.
RE_COLON_AND_SPACES = re.compile(colon_and_spaces)
RE_COMMA_AND_SPACES = re.compile(comma_and_spaces)
RE_ONE_OR_MORE_SPACES = re.compile(one_or_more_spaces)
# Despite the name this matches one or more whitespace characters.
RE_MULTIPLE_SPACES = re.compile(r'\s+')
# It captures any pages range separator.
pages_separator = r'[^A-Za-z\d]+(?=[A-Za-z\d]+)'

# It captures an ORCID
orcid_pattern = r'([0-9]{4}-){3}[0-9]{3}[0-9X]'

# A series of patterns useful to clean invalid "volume" and "issue" fields
good_sep = r'\-'
bad_sep = r'&\/_,\.:+;\(\[\|'
separators = good_sep + bad_sep
alphabets = r'a-zà-öø-ÿ\u0430-я\u0391-ω'  # basic latin, latin-1 supplement, cyrillic, greek
# A single volume/issue value: either a token containing at least one digit or
# a roman numeral, optionally followed by more tokens joined by a separator,
# 'and' or '('. The whole value is one capturing group.
vi_pattern = fr'((?:[{alphabets}]*\d[{alphabets}\d]*|[ivxlcdm]+)(?:\s?(?:[{separators}]|and|\()\s?[{alphabets}\d]+\)?)*?)'
numero = r'(?:№|no?(?!v)\.?|n[º°]\.?|n[uú]m(?:[eé]ro)?|number)'
year_pattern = r'(\d{4})'
valid_vi_patterns = [
    vi_pattern,
    fr'[‹<\()]?[{alphabets}]+?([{separators}\s]?[{alphabets}])*[\)›>]?',
    fr'[{alphabets}{separators}\s]+{vi_pattern}',
    fr"[{alphabets}\d\-'/]*\d[{alphabets}\d\-'/]*(,?\s[{alphabets}\d\-'/]+)+",
    fr'\(?s(uppl([eéi]ment(ary|um)?))?\)?\s?(part)?\s?(s?{vi_pattern})?',
    fr'({vi_pattern}_)?({vi_pattern}\s)?[\(_]?supp?(plement|pl)?l?[\s\._]*({vi_pattern}|[{alphabets}])?\)?\.?',
    fr'{vi_pattern}*,?\s?part[\s_]{vi_pattern}(\sof\s{vi_pattern})?(,\sno?\.\s?{vi_pattern})?',
    fr'{vi_pattern}*[_\s]?pt?[_\s\.]{vi_pattern}',
    r'(ed|pt|d)\sinside(d|r)',
    r'p(ublish\s)?a(head\sof\s)?p(rint)?',
    '預刊文章',
    '[\u0621-\u064A]+',
    fr'\[{year_pattern}\]\s(\d\s)?[{alphabets}]+',
    fr'{vi_pattern}\s\[\+CDROM\]',
    fr'{vi_pattern}[{separators}\s]?\({vi_pattern}\)(\s{vi_pattern})?',
    fr'([{alphabets}]+\.)?[{alphabets}]+\.?',
    fr'[{alphabets}]+-\d+',
    fr'[{alphabets}]+(_[{alphabets}]+)+',
    fr'{numero}:?\s?{vi_pattern}(,?\s({year_pattern}|\({vi_pattern}\)))?',
    r'historica\svol\.\s\d+(,\d+(-\d+)?)?',
    r'\d+\(\d+\)\d{2,4}',
    fr'(\[{year_pattern}\]\s)?(\d+\s)?vl?r(\s\([a-z]+\))?',
    fr'\({vi_pattern}\/{vi_pattern}\)\s[{alphabets}]+(-[{alphabets}]+)?'
]
volumes_valid_patterns = [
    r'original\sseries,\svolume\s\d+',
    fr'(vol(ume)?|tome|cilt)\s?[{separators}]?\s?{vi_pattern}'
]
issues_valid_patterns = [
    fr'issue[\.,]?\s{vi_pattern}',
    fr'({vi_pattern}\s)?e?sp?e?(ecial)?[\s_\-\.](issue)?(_number_)?[\s_-]?({vi_pattern})?(["“][{alphabets}\s]+?["”])?',
    fr'ö(zel)?(\ss(ayı)?|\(special\))?(\s?{vi_pattern})?',
    fr'({numero}[{separators}\s]?)?hors[{separators}\s]série[{separators}\s]{vi_pattern}',
    '특별호',
    fr'([{alphabets}]+\s{year_pattern}\s)?\(?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|(nov|dec)(ember)?|spring|summer|autumn|winter)(\s{year_pattern})?\)?',
    fr'{vi_pattern},\spart\s{vi_pattern}\sof\s{vi_pattern}',
    fr'sayı[{separators}\s]\s?{vi_pattern}',
    fr'issues?\s{vi_pattern},\s(supplement|part)\s{vi_pattern}',
    fr'issues?\s{vi_pattern}\.?\spp\.\s[a-z\d]+[^a-z\d]+[a-z\d]+'
]
# Maps each invalid volume/issue pattern to a strategy label. The labels are
# interpreted by code outside this module.
# NOTE(review): the last entry's label 's)' looks like the literal suffix
# needed to complete '(first serie' into '(first series)' -- confirm against
# the consumer before touching it.
invalid_vi_patterns = {
    fr'.*?(?:vol\.?(?:ume)?|tome)(?:[{separators}]?\s?){vi_pattern}[\-&\/_,\.:+;\(\)\[\]|\s]*(?:{numero}|issues?)[{separators}|\s]*(?:sp[eé]cial\s)?{vi_pattern}': 'vol_iss',
    fr'{vi_pattern},\s?{numero}\s?{vi_pattern}': 'vol_iss',
    fr'tập\s?{vi_pattern},?\s?số\s?{vi_pattern}': 'vol_iss',
    fr'issues?\s{vi_pattern}\svol\.?(?:ume)?\s{vi_pattern}(?:.*?{year_pattern}.*?)?': 'iss_vol_year',
    fr"{vi_pattern}\s?\({vi_pattern}'{year_pattern}\)": 'vol_iss_year',
    fr'cilt[{separators}\s]\s?{vi_pattern}[{separators}\s]sayı[{separators}\s]\s?{vi_pattern}(?:[{separators}\s]\s?temmuz\s{year_pattern})?': 'vol_iss_year',
    r'&na;|n\/a|not\savailable': 'del',
    r'[\:\-\.`ё/]': 'del',
    fr'\${{[{alphabets}]+(\.[{alphabets}]+)?}}': 'del',
    fr"[&\/_,:+;\|`'#]\s*{vi_pattern}": 'all',
    fr'[\->+]{vi_pattern}': 'do_nothing',
    fr"{vi_pattern}[\.+]": "do_nothing",
    fr"{numero}?[{separators}]?\s?{vi_pattern}[&\/_,:;\|`'\(\[\{{]": 'all',
    fr'{vi_pattern}\(\)': 'all',
    fr'n[�?]+{vi_pattern}': 'all',
    fr'{vi_pattern}(?:â\x80[\x92\x93\x94]|�+|â|\?+){vi_pattern}': 'sep',
    fr'{vi_pattern}\s?\(first\sserie': 's)'
}

# Compiled, case-insensitive, whole-string versions of the tables above.
# Each pattern is wrapped in a non-capturing group before anchoring: a bare
# f'^{pattern}$' around a top-level alternation such as
# '&na;|n\/a|not\savailable' anchors only its first and last alternatives, so
# 'n/a' would match anywhere in the text. A non-capturing group leaves the
# numbering of the patterns' own capture groups untouched.
RE_INVALID_VI_PATTERNS = {
    re.compile(f'^(?:{pattern})$', re.IGNORECASE): strategy
    for pattern, strategy in invalid_vi_patterns.items()
}
RE_VOLUMES_VALID_PATTERNS = [
    re.compile(f'^(?:{pattern})$', re.IGNORECASE)
    for pattern in volumes_valid_patterns
]
RE_ISSUES_VALID_PATTERNS = [
    re.compile(f'^(?:{pattern})$', re.IGNORECASE)
    for pattern in issues_valid_patterns
]