Coverage for src / piccione / upload / on_figshare.py: 81%

135 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2026-05-27 20:21 +0000

1# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import argparse 

6import hashlib 

7import json 

8import time 

9from pathlib import Path 

10from typing import BinaryIO, TypedDict, cast 

11 

12import requests 

13import yaml 

14from requests.exceptions import HTTPError 

15from rich.console import Console 

16from tqdm import tqdm 

17 

# Shared Rich console through which this module prints its status and error messages.
console = Console()

19 

# Root endpoint for the authenticated account's articles (Figshare API v2).
BASE_URL = "https://api.figshare.com/v2/account/articles"

# Number of bytes read per iteration when hashing a local file (1 MiB).
CHUNK_SIZE = 1024 * 1024

# Responses whose status code is at or above this value are treated as server errors.
HTTP_INTERNAL_SERVER_ERROR = 500

23 

24 

25class FigshareFileInfo(TypedDict): 

26 upload_url: str 

27 id: int | str 

28 

29 

class FigsharePart(TypedDict):
    """Description of one part of a multi-part upload."""

    # Part number; used as the last path segment of the part's upload URL.
    partNo: int
    # Offset of the first byte of the part within the local file.
    startOffset: int
    # Offset of the last byte of the part (inclusive).
    endOffset: int

34 

35 

class FigsharePartsResponse(TypedDict):
    """Shape of the upload-URL response that lists the parts to send."""

    # Parts to upload, in the order the service reported them.
    parts: list[FigsharePart]

38 

39 

40class FigshareExistingFile(TypedDict): 

41 id: int | str 

42 md5: str 

43 

44 

def get_file_check_data(file_name: str | Path) -> tuple[str, int]:
    """Compute the MD5 hex digest and the size in bytes of a local file.

    The file is consumed in ``CHUNK_SIZE`` pieces so that it never has to be
    held in memory as a whole.

    Args:
        file_name: Path of the file to hash.

    Returns:
        A ``(md5_hex_digest, size_in_bytes)`` tuple.
    """
    # MD5 serves only as an integrity checksum here, hence usedforsecurity=False.
    digest = hashlib.md5(usedforsecurity=False)
    total_bytes = 0
    with Path(file_name).open("rb") as handle:
        while block := handle.read(CHUNK_SIZE):
            total_bytes += len(block)
            digest.update(block)
    return digest.hexdigest(), total_bytes

55 

56 

def _sleep_before_retry(attempt: int) -> None:
    """Announce and perform the back-off pause that follows failed attempt number ``attempt``."""
    # Exponential back-off: 1, 2, 4, ... seconds, never more than 60.
    wait = min(2 ** (attempt - 1), 60)
    console.print(f"Retrying in {wait}s...")
    time.sleep(wait)


def issue_request(
    method: str,
    url: str,
    token: str,
    data: str | bytes | dict[str, object] | None = None,
    *,
    binary: bool = False,
) -> dict[str, object] | bytes:
    """Send an authenticated request to Figshare, retrying transient failures.

    Responses with a status of 500 or above, timeouts and connection errors are
    retried with no attempt limit, pausing 1, 2, 4, ... seconds (capped at 60)
    between attempts. Any other error status is reported and raised immediately.

    Args:
        method: HTTP method name, e.g. ``"GET"`` or ``"PUT"``.
        url: Absolute URL to call.
        token: Figshare API token, sent in the ``Authorization`` header.
        data: Request body. It is JSON-encoded unless ``binary`` is true.
        binary: When true, ``data`` is sent as-is instead of being JSON-encoded.

    Returns:
        The decoded JSON body, or the raw response bytes when the body is not
        valid JSON (an empty body included).

    Raises:
        requests.exceptions.HTTPError: For an error status below 500.
    """
    headers = {"Authorization": "token " + token}
    if data is not None and not binary:
        data = json.dumps(data)

    # The messages below embed text that comes from the server or from an
    # exception. They are printed with markup=False so that Rich does not read
    # bracketed fragments as style tags: it would drop them from the output, or
    # raise MarkupError on a stray closing tag and hide the real failure.
    attempt = 0
    while True:
        attempt += 1
        try:
            response = requests.request(method, url, headers=headers, data=data, timeout=(30, 300))
            if response.status_code >= HTTP_INTERNAL_SERVER_ERROR:
                console.print(
                    f"[ERROR] Server error {response.status_code}: {response.text[:200]}",
                    markup=False,
                )
                _sleep_before_retry(attempt)
                continue
            response.raise_for_status()
            try:
                return json.loads(response.content)
            except ValueError:
                # Not JSON (for instance an empty body): hand back the raw bytes.
                return response.content
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
            console.print(f"[ERROR] Network error: {e}", markup=False)
            _sleep_before_retry(attempt)
        except HTTPError as e:
            console.print(f"[ERROR] HTTP error: {e}", markup=False)
            if e.response is not None:
                console.print("Body:", e.response.text, markup=False)
            raise

95 

96 

def upload_parts(file_info: FigshareFileInfo, file_path: str | Path, token: str) -> None:
    """Upload every part of a local file, showing a byte-level progress bar.

    The list of parts is fetched from the file's upload URL; each part is then
    sent in the order the service listed it.

    Args:
        file_info: File record holding the ``upload_url`` to query and send to.
        file_path: Local file whose content is uploaded.
        token: Figshare API token.

    Raises:
        TypeError: If the upload URL does not answer with a JSON object.
    """
    upload_info = issue_request(method="GET", url=file_info["upload_url"], token=token)
    if not isinstance(upload_info, dict):
        msg = "Expected dict response"
        raise TypeError(msg)

    local_path = Path(file_path)
    console.print(f"\nUploading {local_path.name}:")

    parts = cast("FigsharePartsResponse", upload_info)["parts"]
    # Offsets are inclusive on both ends, hence the +1.
    part_sizes = [part["endOffset"] - part["startOffset"] + 1 for part in parts]

    with (
        local_path.open("rb") as fin,
        tqdm(total=sum(part_sizes), unit="B", unit_scale=True, unit_divisor=1024) as pbar,
    ):
        for part, part_size in zip(parts, part_sizes):
            upload_part(file_info, fin, part, token)
            pbar.update(part_size)

115 

116 

def upload_part(file_info: FigshareFileInfo, stream: BinaryIO, part: FigsharePart, token: str) -> None:
    """Read one part from ``stream`` and PUT it to the part's upload URL.

    Args:
        file_info: File record providing the base ``upload_url``.
        stream: Seekable binary stream over the local file.
        part: Part description with its number and inclusive byte offsets.
        token: Figshare API token.
    """
    part_url = f"{file_info['upload_url']}/{part['partNo']}"
    first_byte = part["startOffset"]
    stream.seek(first_byte)
    # endOffset is inclusive, so the part spans (end - start + 1) bytes.
    payload = stream.read(part["endOffset"] - first_byte + 1)
    issue_request(method="PUT", url=part_url, data=payload, binary=True, token=token)
    console.print(f" Uploaded part {part['partNo']} from {part['startOffset']} to {part['endOffset']}")

123 

124 

def get_existing_files(article_id: str, token: str) -> dict[str, FigshareExistingFile]:
    """Return the files already attached to an article, keyed by file name.

    The listing is fetched page by page, so articles holding more files than
    fit in one page of results are reported in full rather than cut short.

    Args:
        article_id: Identifier of the Figshare article.
        token: Figshare API token.

    Returns:
        A mapping from file name to that file's ``id`` and ``md5`` (taken from
        the ``computed_md5`` field of the listing).

    Raises:
        requests.exceptions.HTTPError: If a listing request answers with an
            error status.
    """
    url = f"{BASE_URL}/{article_id}/files"
    headers = {"Authorization": f"token {token}"}
    page_size = 100

    existing: dict[str, FigshareExistingFile] = {}
    page = 1
    while True:
        response = requests.get(
            url,
            headers=headers,
            params={"page": page, "page_size": page_size},
            timeout=30,
        )
        response.raise_for_status()
        batch = response.json()

        known_before = len(existing)
        for entry in batch:
            existing[entry["name"]] = {"id": entry["id"], "md5": entry["computed_md5"]}

        # A short page is the last one. The second condition stops the loop when
        # a page adds nothing new, which protects against a server that ignores
        # the paging parameters and keeps returning the same listing.
        if len(batch) < page_size or len(existing) == known_before:
            return existing
        page += 1

131 

132 

def delete_file(article_id: str, file_id: str, token: str) -> None:
    """Delete one file from a Figshare article.

    Args:
        article_id: Identifier of the article that owns the file.
        file_id: Identifier of the file to remove.
        token: Figshare API token.

    Raises:
        requests.exceptions.HTTPError: If the request answers with an error status.
    """
    outcome = requests.delete(
        f"{BASE_URL}/{article_id}/files/{file_id}",
        headers={"Authorization": f"token {token}"},
        timeout=30,
    )
    outcome.raise_for_status()

138 

139 

def create_file(article_id: str, file_name: str, file_path: str | Path, token: str) -> FigshareFileInfo:
    """Register a new file on an article and return its Figshare record.

    The file is announced with its base name, MD5 and size; the record is then
    fetched from the ``location`` given in the creation response.

    Args:
        article_id: Identifier of the article receiving the file.
        file_name: Name to register; only its final path component is used.
        file_path: Local file from which the MD5 and size are computed.
        token: Figshare API token.

    Returns:
        The JSON record fetched from the new file's location.

    Raises:
        requests.exceptions.HTTPError: If either request answers with an error status.
    """
    auth = {"Authorization": f"token {token}"}
    checksum, byte_count = get_file_check_data(file_path)
    payload = {"name": Path(file_name).name, "md5": checksum, "size": byte_count}

    created = requests.post(f"{BASE_URL}/{article_id}/files", headers=auth, json=payload, timeout=30)
    created.raise_for_status()

    location = created.json()["location"]
    details = requests.get(location, headers=auth, timeout=30)
    details.raise_for_status()
    return details.json()

150 

151 

def complete_upload(article_id: str, file_id: str, token: str) -> None:
    """Tell Figshare that all parts of a file have been sent.

    Args:
        article_id: Identifier of the article that owns the file.
        file_id: Identifier of the file whose upload is being completed.
        token: Figshare API token.
    """
    issue_request(method="POST", url=f"{BASE_URL}/{article_id}/files/{file_id}", token=token)
    console.print(f" Upload completion confirmed for file {file_id}")

156 

157 

def main(config_path: str | Path) -> None:
    """Upload the files listed in a YAML configuration to a Figshare article.

    The configuration must provide ``TOKEN``, ``ARTICLE_ID`` and
    ``files_to_upload``. A file whose name and MD5 match a file already on the
    article is skipped; one whose name matches but whose MD5 differs is deleted
    remotely and uploaded again.

    Args:
        config_path: Path of the YAML configuration file.
    """
    with Path(config_path).open() as config_file:
        settings = yaml.safe_load(config_file)

    token = settings["TOKEN"]
    article_id = settings["ARTICLE_ID"]
    files_to_upload = settings["files_to_upload"]

    console.print(f"Starting upload of {len(files_to_upload)} files to Figshare...")
    remote_files = get_existing_files(article_id, token)
    console.print(f"Found {len(remote_files)} existing files in article")

    for file_path in tqdm(files_to_upload, desc="Total progress", unit="file"):
        file_name = Path(file_path).name
        local_md5, _ = get_file_check_data(file_path)

        remote = remote_files.get(file_name)
        if remote is not None:
            if remote["md5"] == local_md5:
                console.print(f"\n[SKIP] {file_name} (already uploaded, MD5 matches)")
                continue
            # Same name, different content: remove the stale copy before re-uploading.
            console.print(f"\n[REPLACE] {file_name} (MD5 mismatch, deleting old version)")
            delete_file(article_id, str(remote["id"]), token)

        console.print(f"\nPreparing {file_name}...")
        file_info = create_file(article_id, file_name, file_path, token)
        upload_parts(file_info, file_path, token)
        complete_upload(article_id, str(file_info["id"]), token)
        console.print(f"[OK] {file_name} completed")

    console.print("\nAll files uploaded successfully to Figshare!")

188 

189 

if __name__ == "__main__":  # pragma: no cover
    # Command-line entry point: one positional argument, the YAML config path.
    cli = argparse.ArgumentParser(description="Upload files to Figshare.")
    cli.add_argument("config", help="Path to the YAML configuration file.")
    main(cli.parse_args().config)