Coverage for src / piccione / download / from_figshare.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2026-05-27 20:21 +0000

1# Copyright (C) 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# 

4# SPDX-License-Identifier: ISC 

5 

6""" 

7Download files from a Figshare article using the Figshare API. 

8 

9This script downloads all files associated with a Figshare article ID. 

10It uses the public Figshare API, which works reliably, unlike direct wget/curl

11requests to Figshare URLs.

12""" 

13 

14from __future__ import annotations 

15 

16import argparse 

17import hashlib 

18import sys 

19from pathlib import Path 

20from typing import TypedDict 

21 

22import requests 

23from rich.console import Console 

24from tqdm import tqdm 

25 

# Shared rich console through which this module prints its status messages.
console = Console()

27 

# Root of the Figshare v2 REST API; endpoint paths are appended to it.
BASE_URL = "https://api.figshare.com/v2"
# Number of bytes requested per chunk when streaming a download to disk (8 KiB).
CHUNK_SIZE = 8 * 1024

30 

31 

class FigshareFileEntry(TypedDict):
    """One file record from a Figshare article's file list.

    Only the keys this module reads are declared here.
    """

    # File name; also used as the local file name when the download is saved.
    name: str
    # File size in bytes; shown to the user in MB and used as the progress total.
    size: int
    # URL from which the file content is fetched.
    download_url: str
    # Hex MD5 digest supplied with the record, or None when there is none.
    supplied_md5: str | None

37 

38 

class FigshareArticleMetadata(TypedDict):
    """Article metadata, reduced to the single key this module relies on."""

    # File records attached to the article.
    files: list[FigshareFileEntry]

41 

42 

def get_article_metadata(article_id: int) -> FigshareArticleMetadata:
    """Fetch an article's metadata together with its complete file list.

    The article endpoint only embeds a limited number of files (10 by
    default), so the file list is retrieved from the dedicated ``/files``
    endpoint, page by page, and stored under the ``"files"`` key of the
    returned metadata.

    Args:
        article_id: Numeric Figshare article identifier.

    Returns:
        The decoded article JSON with ``"files"`` replaced by the full list
        of file records.

    Raises:
        requests.HTTPError: If any of the requests returns an error status.
    """
    url = f"{BASE_URL}/articles/{article_id}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    article_data = response.json()

    files_url = f"{BASE_URL}/articles/{article_id}/files"
    page_size = 1000
    files = []
    page = 1
    while True:
        files_response = requests.get(
            files_url,
            params={"page": page, "page_size": page_size},
            timeout=30,
        )
        files_response.raise_for_status()
        batch = files_response.json()
        files.extend(batch)
        # A page shorter than page_size (including an empty one) is the last
        # page; articles with fewer files still need only a single request.
        if len(batch) < page_size:
            break
        page += 1

    article_data["files"] = files
    return article_data

56 

57 

def download_file(
    download_url: str,
    output_path: str | Path,
    expected_size: int,
    expected_md5: str | None = None,
) -> None:
    """Stream a remote file to disk and optionally verify its MD5 checksum.

    The body is written in ``CHUNK_SIZE`` pieces while a progress bar is
    advanced and an MD5 digest is accumulated over the same bytes.

    Args:
        download_url: URL to fetch.
        output_path: Destination file; it is opened in binary write mode, so
            an existing file is overwritten.
        expected_size: Total used for the progress bar only; the number of
            bytes actually received is not checked against it.
        expected_md5: Hex digest to compare against. When falsy (``None`` or
            empty), no verification is performed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the computed MD5 differs from ``expected_md5``. The
            downloaded file is left on disk in that case.
    """
    response = requests.get(download_url, stream=True, timeout=(30, 300))
    try:
        response.raise_for_status()

        md5_hash = hashlib.md5(usedforsecurity=False)

        with (
            Path(output_path).open("wb") as f,
            tqdm(total=expected_size, unit="B", unit_scale=True, unit_divisor=1024) as pbar,
        ):
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                md5_hash.update(chunk)
                pbar.update(len(chunk))
    finally:
        # A streamed response keeps its connection checked out until the body
        # is fully consumed or the response is closed; close it explicitly so
        # an error status, an unwritable path or a failed write does not leak
        # the connection.
        response.close()

    if expected_md5:
        actual_md5 = md5_hash.hexdigest()
        if actual_md5 != expected_md5:
            msg = f"MD5 mismatch: expected {expected_md5}, got {actual_md5}"
            raise ValueError(msg)
        console.print(f"  MD5 checksum verified: {actual_md5}")

84 

85 

def main() -> int:  # pragma: no cover
    """Command-line entry point: download every file of a Figshare article.

    Parses the article ID and the optional output directory, creates the
    directory if needed, lists the article's files and downloads them one
    after another.

    Returns:
        ``0`` when all files were downloaded, ``1`` when the article has no
        files.

    Raises:
        ValueError: If a file name from the metadata has no usable final
            path component, or if ``download_file`` reports an MD5 mismatch.
    """
    parser = argparse.ArgumentParser(description="Download files from a Figshare article")
    parser.add_argument("article_id", type=int, help="Figshare article ID")
    parser.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        default=Path(),
        help="Output directory for downloaded files (default: current directory)",
    )

    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    console.print(f"Fetching metadata for article {args.article_id}...")
    metadata = get_article_metadata(args.article_id)

    files = metadata["files"]
    if not files:
        console.print("No files found in this article")
        return 1

    console.print(f"\nFound {len(files)} file(s) to download:")
    for f in files:
        size_mb = f["size"] / (1024 * 1024)
        console.print(f"  - {f['name']} ({size_mb:.2f} MB)")

    console.print(f"\nDownloading to: {args.output_dir.absolute()}\n")

    for file_info in files:
        filename = file_info["name"]
        download_url = file_info["download_url"]
        size = file_info["size"]
        md5 = file_info["supplied_md5"]

        # The name comes from a remote service: keep only its final path
        # component so an absolute name or one containing ".." cannot place
        # the file outside the chosen output directory.
        safe_name = Path(filename).name
        if safe_name in {"", ".", ".."}:
            msg = f"Unsafe file name in article metadata: {filename!r}"
            raise ValueError(msg)

        output_path = args.output_dir / safe_name

        console.print(f"Downloading {filename}...")
        download_file(download_url, output_path, size, md5)
        console.print(f"  Saved to {output_path}\n")

    console.print("All files downloaded successfully")
    return 0

130 

131 

# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())