Coverage for ramose / hash_format.py: 98%

45 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-15 15:58 +0000

1# SPDX-FileCopyrightText: 2018-2021 Silvio Peroni <silvio.peroni@unibo.it> 

2# SPDX-FileCopyrightText: 2020-2021 Marilena Daquino <marilena.daquino2@unibo.it> 

3# SPDX-FileCopyrightText: 2022 Davide Brembilla 

4# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it> 

5# SPDX-FileCopyrightText: 2025 Sergei Slinkin 

6# SPDX-FileCopyrightText: 2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

7# 

8# SPDX-License-Identifier: ISC 

9 

10from pathlib import Path 

11from re import DOTALL, search 

12 

13BUILTIN_PARAMS = frozenset({"require", "filter", "sort", "format", "json", "page", "page_size"}) 

14 

15 

def parse_disable_params(raw: str) -> set[str]:
    """Turn the raw value of a "disable parameters" setting into a set of names.

    The value is either the single wildcard ``*`` (surrounding whitespace is
    ignored), which expands to a copy of ``BUILTIN_PARAMS``, or a
    comma-separated list of names. Names are stripped of surrounding
    whitespace and empty items are dropped.
    """
    spec = raw.strip()
    if spec != "*":
        candidates = (chunk.strip() for chunk in spec.split(","))
        return {candidate for candidate in candidates if candidate}
    # Wildcard: hand back a fresh mutable set, never the shared frozenset.
    return set(BUILTIN_PARAMS)

21 

22 

def parse_custom_params(raw: str) -> dict[str, dict[str, str]]:
    """Parse a ``;``-separated list of custom parameter declarations.

    Each non-empty entry has the form ``name,handler,phase[,description]``.
    Only the first three commas act as separators, so the description may
    itself contain commas. Every field is stripped of surrounding whitespace,
    and a missing description becomes the empty string. If the same name is
    declared more than once, the last declaration wins.

    Returns:
        A mapping from parameter name to a dictionary with the keys
        ``"handler"``, ``"phase"`` and ``"description"``.

    Raises:
        ValueError: if an entry has fewer than the three mandatory fields.
    """
    result: dict[str, dict[str, str]] = {}
    for raw_part in raw.split(";"):
        part = raw_part.strip()
        if not part:
            # Tolerate empty entries such as a trailing ";".
            continue
        fields = [field.strip() for field in part.split(",", 3)]
        if len(fields) < 3:
            # Fail with a message that names the offending entry instead of
            # the generic "not enough values to unpack" error.
            raise ValueError(
                f"Invalid custom parameter declaration {part!r}: "
                "expected 'name,handler,phase[,description]'"
            )
        name, handler, phase, *desc_parts = fields
        result[name] = {
            "handler": handler,
            "phase": phase,
            "description": desc_parts[0] if desc_parts else "",
        }
    return result

36 

37 

class HashFormatHandler:
    """Reader for specification files written in Hash Format (see
    https://github.com/opencitations/ramose#Hashformat-configuration-file).

    A Hash Format file (.hf) stores information as a sequence of fields, each
    introduced by a line of the form ``#<field_name> <field_value>``:

    ```
    #<field_name_1> <field_value_1>
    #<field_name_1> <field_value_2>
    #<field_name_3> <field_value_3>
    [...]
    #<field_name_n> <field_value_n>
    ```"""

    def read(self, file_path):
        """Parse the Hash Format document stored at `file_path`.

        The name of the first field found in the file acts as the record
        separator: every later occurrence of that name starts a new record.
        Lines that do not introduce a field are appended to the value of the
        most recently seen field. The result is a list of dictionaries, one
        per record, whose values have trailing whitespace removed."""
        records = []
        current = {}
        separator_name = None
        last_name = None

        # No explicit encoding is passed, so the platform default applies.
        with Path(file_path).open(newline=None) as handle:
            for line in handle:
                hit = search(r"^#([^\s]+)\s(.+)$", line, DOTALL)

                if hit is None:
                    # Continuation line: extend the latest field, provided a
                    # record has already been started.
                    if current and last_name is not None:
                        current[last_name] += line
                    continue

                last_name, content = hit.groups()
                if not (last_name and content):
                    continue

                # The very first field name becomes the record separator.
                if separator_name is None:
                    separator_name = last_name

                # Seeing the separator again closes the record in progress.
                if last_name == separator_name:
                    if current:
                        records.append(current)
                    current = {}

                current[last_name] = content

        # Flush the record still being built when the file ends.
        if current:
            records.append(current)

        # Values keep their line terminators while parsing; trim them here.
        return [
            {name: value.rstrip() for name, value in record.items()}
            for record in records
        ]