Coverage for oc_ds_converter / lib / master_of_regex.py: 100%

19 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

# Split a people field on ';' (any spaces before and after the ';' are consumed).
# A ';' is a separator only when it lies outside '[]': the lookahead rejects it
# if a ']' is met to its right before any '[' or the end of the string.
semicolon_in_people_field = r'\s*;\s*(?=[^\]]*(?:\[|$))'

7 

# Captures the string inside '[]', ignoring the spaces next to the brackets
# (ex: [ TARGET ] --> TARGET).
# Every id is a whitespace-free token made of a schema, a colon and a value;
# several ids are separated by spaces.
# Since the colon is mandatory, in strings like 'Boezaart, Andr[eacute] [omid:123]'
# the '[eacute]' part is skipped and the id captured is 'omid:123'.
# Brackets that are empty, or that contain only spaces, are also a valid match
# (the captured group is then the empty string).
# NOTE: square brackets are not excluded from the id tokens, so two adjacent
# bracket groups such as '[a:1] [b:2]' are captured as the single span 'a:1] [b:2'.
ids_inside_square_brackets = r'\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]'

# Captures the name and the ids in two groups: group 1 is the (lazily matched,
# space-trimmed) text that precedes the brackets, group 2 is the content of the
# brackets as described for ids_inside_square_brackets.
name_and_ids = fr'\s*(.*?)\s*{ids_inside_square_brackets}'

19 

# Matches a colon together with any whitespace (zero or more characters) around it.
colon_and_spaces = r'\s*:\s*'

22 

# Matches a comma together with any whitespace (zero or more characters) around it.
comma_and_spaces = r'\s*,\s*'

25 

# Matches a run of one or more whitespace characters (spaces, tabs, newlines, ...).
one_or_more_spaces = r'\s+'

28 

# Matches any pages range separator: a run of characters that are neither ASCII
# letters nor digits, provided that a letter or digit follows it.
pages_separator = r'[^A-Za-z\d]+(?=[A-Za-z\d]+)'

31 

# Matches an ORCID: four blocks of four characters separated by dashes, all
# digits except the very last character, which may also be 'X'.
# NOTE: group 1 only holds the last repeated 'dddd-' block; read the whole
# match (group 0) to get the ORCID.
orcid_pattern = r'([0-9]{4}-){3}[0-9]{3}[0-9X]'

34 

# A series of patterns useful to clean invalid "volume" and "issue" fields.
# They are all plain strings (not compiled patterns), built by interpolating
# the fragments defined first.

# Character-class fragments (to be placed inside '[...]').
good_sep = r'\-'
bad_sep = r'&\/_,\.:+;\(\[\|'
separators = good_sep + bad_sep
alphabets = r'a-zà-öø-ÿ\u0430-я\u0391-ω'  # basic latin, latin-1 supplement, cyrillic, greek

# A single volume/issue value, captured as one group: either a token containing
# at least one digit or a roman numeral, optionally followed (lazily) by more
# tokens, each introduced by a separator, 'and' or '('.
vi_pattern = fr'((?:[{alphabets}]*\d[{alphabets}\d]*|[ivxlcdm]+)(?:\s?(?:[{separators}]|and|\()\s?[{alphabets}\d]+\)?)*?)'
# Spellings of "number" ('№', 'n', 'no', 'nº', 'num', 'número', 'number', ...);
# the (?!v) lookahead keeps 'no' from matching the beginning of 'nov'.
numero = r'(?:№|no?(?!v)\.?|n[º°]\.?|n[uú]m(?:[eé]ro)?|number)'
# Four digits, captured.
year_pattern = r'(\d{4})'

# Shapes listed as valid; judging by the list names, this one is shared by
# volumes and issues, while the two lists that follow are field-specific.
valid_vi_patterns = [
    vi_pattern,
    fr'[‹<\()]?[{alphabets}]+?([{separators}\s]?[{alphabets}])*[\)›>]?',
    fr'[{alphabets}{separators}\s]+{vi_pattern}',
    fr"[{alphabets}\d\-'/]*\d[{alphabets}\d\-'/]*(,?\s[{alphabets}\d\-'/]+)+",
    fr'\(?s(uppl([eéi]ment(ary|um)?))?\)?\s?(part)?\s?(s?{vi_pattern})?',
    fr'({vi_pattern}_)?({vi_pattern}\s)?[\(_]?supp?(plement|pl)?l?[\s\._]*({vi_pattern}|[{alphabets}])?\)?\.?',
    fr'{vi_pattern}*,?\s?part[\s_]{vi_pattern}(\sof\s{vi_pattern})?(,\sno?\.\s?{vi_pattern})?',
    fr'{vi_pattern}*[_\s]?pt?[_\s\.]{vi_pattern}',
    r'(ed|pt|d)\sinside(d|r)',
    r'p(ublish\s)?a(head\sof\s)?p(rint)?',
    '預刊文章',
    r'[\u0621-\u064A]+',
    fr'\[{year_pattern}\]\s(\d\s)?[{alphabets}]+',
    fr'{vi_pattern}\s\[\+CDROM\]',
    fr'{vi_pattern}[{separators}\s]?\({vi_pattern}\)(\s{vi_pattern})?',
    fr'([{alphabets}]+\.)?[{alphabets}]+\.?',
    fr'[{alphabets}]+-\d+',
    fr'[{alphabets}]+(_[{alphabets}]+)+',
    fr'{numero}:?\s?{vi_pattern}(,?\s({year_pattern}|\({vi_pattern}\)))?',
    r'historica\svol\.\s\d+(,\d+(-\d+)?)?',
    r'\d+\(\d+\)\d{2,4}',
    fr'(\[{year_pattern}\]\s)?(\d+\s)?vl?r(\s\([a-z]+\))?',
    fr'\({vi_pattern}\/{vi_pattern}\)\s[{alphabets}]+(-[{alphabets}]+)?',
]
volumes_valid_patterns = [
    r'original\sseries,\svolume\s\d+',
    fr'(vol(ume)?|tome|cilt)\s?[{separators}]?\s?{vi_pattern}',
]
issues_valid_patterns = [
    fr'issue[\.,]?\s{vi_pattern}',
    fr'({vi_pattern}\s)?e?sp?e?(ecial)?[\s_\-\.](issue)?(_number_)?[\s_-]?({vi_pattern})?(["“][{alphabets}\s]+?["”])?',
    fr'ö(zel)?(\ss(ayı)?|\(special\))?(\s?{vi_pattern})?',
    fr'({numero}[{separators}\s]?)?hors[{separators}\s]série[{separators}\s]{vi_pattern}',
    '특별호',
    fr'([{alphabets}]+\s{year_pattern}\s)?\(?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|(nov|dec)(ember)?|spring|summer|autumn|winter)(\s{year_pattern})?\)?',
    fr'{vi_pattern},\spart\s{vi_pattern}\sof\s{vi_pattern}',
    fr'sayı[{separators}\s]\s?{vi_pattern}',
    fr'issues?\s{vi_pattern},\s(supplement|part)\s{vi_pattern}',
    fr'issues?\s{vi_pattern}\.?\spp\.\s[a-z\d]+[^a-z\d]+[a-z\d]+',
]

# Maps the pattern of an invalid value to a label naming how to handle it.
# The labels are interpreted by the code that consumes this mapping (not part
# of this module); presumably names such as 'vol_iss' or 'vol_iss_year' give
# the order of the captured groups -- verify against the consumer.
invalid_vi_patterns = {
    fr'.*?(?:vol\.?(?:ume)?|tome)(?:[{separators}]?\s?){vi_pattern}[\-&\/_,\.:+;\(\)\[\]|\s]*(?:{numero}|issues?)[{separators}|\s]*(?:sp[eé]cial\s)?{vi_pattern}': 'vol_iss',
    fr'{vi_pattern},\s?{numero}\s?{vi_pattern}': 'vol_iss',
    fr'tập\s?{vi_pattern},?\s?số\s?{vi_pattern}': 'vol_iss',
    fr'issues?\s{vi_pattern}\svol\.?(?:ume)?\s{vi_pattern}(?:.*?{year_pattern}.*?)?': 'iss_vol_year',
    fr"{vi_pattern}\s?\({vi_pattern}'{year_pattern}\)": 'vol_iss_year',
    fr'cilt[{separators}\s]\s?{vi_pattern}[{separators}\s]sayı[{separators}\s]\s?{vi_pattern}(?:[{separators}\s]\s?temmuz\s{year_pattern})?': 'vol_iss_year',
    r'&na;|n\/a|not\savailable': 'del',
    r'[\:\-\.`ё/]': 'del',
    fr'\${{[{alphabets}]+(\.[{alphabets}]+)?}}': 'del',
    fr"[&\/_,:+;\|`'#]\s*{vi_pattern}": 'all',
    fr'[\->+]{vi_pattern}': 'do_nothing',
    fr"{vi_pattern}[\.+]": "do_nothing",
    fr"{numero}?[{separators}]?\s?{vi_pattern}[&\/_,:;\|`'\(\[\{{]": 'all',
    fr'{vi_pattern}\(\)': 'all',
    fr'n[�?]+{vi_pattern}': 'all',
    # Two values joined by a mis-decoded dash (mojibake), replacement
    # characters, a stray 'â' or question marks.
    fr'{vi_pattern}(?:â\x80[\x92\x93\x94]|�+|â|\?+){vi_pattern}': 'sep',
    # The value is the literal text 's)', i.e. what is missing after
    # '(first serie'.
    fr'{vi_pattern}\s?\(first\sserie': 's)',
}