Coverage for src/piccione/download/from_figshare.py: 100%

33 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2026-03-21 11:49 +0000

1#!/usr/bin/python 

2 

3# Copyright (C) 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

5# 

6# SPDX-License-Identifier: ISC 

7 

8""" 

9Download files from a Figshare article using the Figshare API. 

10 

11This script downloads all files associated with a Figshare article ID. 

12It uses the public Figshare API which works reliably unlike direct wget/curl 

13on Figshare URLs. 

14""" 

15 

16import argparse 

17import hashlib 

18import sys 

19from pathlib import Path 

20 

21import requests 

22from tqdm import tqdm 

23 

# Root endpoint of the public Figshare REST API, version 2.
BASE_URL = "https://api.figshare.com/v2"

# Number of bytes read per iteration when streaming a download to disk.
CHUNK_SIZE = 8192

26 

27 

def get_article_metadata(article_id, timeout=60):
    """Retrieve article metadata from the Figshare API.

    Args:
        article_id: Numeric Figshare article identifier.
        timeout: Seconds to wait for each HTTP request before aborting.
            Without a timeout, ``requests`` can block indefinitely on a
            stalled connection.

    Returns:
        The article metadata dict, with its ``'files'`` key replaced by
        the complete file list fetched from the dedicated files endpoint.

    Raises:
        requests.HTTPError: If either API request returns an error status.
    """
    url = f"{BASE_URL}/articles/{article_id}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    article_data = response.json()

    # Figshare API has a default limit of 10 files. We need to fetch files
    # separately with pagination (a large page_size covers typical articles).
    files_url = f"{BASE_URL}/articles/{article_id}/files"
    files_response = requests.get(files_url, params={"page_size": 1000}, timeout=timeout)
    files_response.raise_for_status()
    article_data['files'] = files_response.json()

    return article_data

42 

43 

def download_file(download_url, output_path, expected_size, expected_md5=None, timeout=60):
    """Download a file from URL with progress bar and optional MD5 verification.

    Args:
        download_url: Direct download URL for the file.
        output_path: Filesystem path where the file is written.
        expected_size: Expected size in bytes (drives the progress bar total).
        expected_md5: Optional hex MD5 digest to verify against after download.
        timeout: Seconds to wait for the connection/read before aborting;
            without it a stalled server would hang the script forever.

    Raises:
        requests.HTTPError: If the download request returns an error status.
        ValueError: If ``expected_md5`` is given and does not match the data.
    """
    # Close the streaming response even if writing fails part-way through.
    with requests.get(download_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        md5_hash = hashlib.md5()

        with open(output_path, 'wb') as f:
            with tqdm(total=expected_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    # iter_content may yield empty keep-alive chunks; skip them
                    # so the progress bar and hash see only real data.
                    if chunk:
                        f.write(chunk)
                        md5_hash.update(chunk)
                        pbar.update(len(chunk))

    if expected_md5:
        actual_md5 = md5_hash.hexdigest()
        if actual_md5 != expected_md5:
            raise ValueError(f"MD5 mismatch: expected {expected_md5}, got {actual_md5}")
        print(f" MD5 checksum verified: {actual_md5}")

63 

64 

def main():  # pragma: no cover
    """Parse CLI arguments and download every file of a Figshare article.

    Returns:
        0 on success, 1 if the article contains no files (used as the
        process exit code by the ``__main__`` guard).
    """
    parser = argparse.ArgumentParser(
        description="Download files from a Figshare article"
    )
    parser.add_argument(
        "article_id",
        type=int,
        help="Figshare article ID"
    )
    parser.add_argument(
        "-o", "--output-dir",
        type=Path,
        default=Path("."),
        help="Output directory for downloaded files (default: current directory)"
    )

    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Fetching metadata for article {args.article_id}...")
    metadata = get_article_metadata(args.article_id)

    files = metadata.get("files", [])
    if not files:
        print("No files found in this article")
        return 1

    print(f"\nFound {len(files)} file(s) to download:")
    for f in files:
        size_mb = f['size'] / (1024 * 1024)
        print(f"  - {f['name']} ({size_mb:.2f} MB)")

    print(f"\nDownloading to: {args.output_dir.absolute()}\n")

    for file_info in files:
        filename = file_info['name']
        download_url = file_info['download_url']
        size = file_info['size']
        # 'supplied_md5' may be absent; download_file skips verification then.
        md5 = file_info.get('supplied_md5')

        output_path = args.output_dir / filename

        # BUG FIX: the original printed the literal placeholder text instead
        # of interpolating the file name it had just extracted.
        print(f"Downloading {filename}...")
        download_file(download_url, output_path, size, md5)
        print(f" Saved to {output_path}\n")

    print("All files downloaded successfully")
    return 0

114 

115 

# Script entry point: propagate main()'s integer return value as the
# process exit status (0 = success, 1 = article had no files).
if __name__ == "__main__": # pragma: no cover
    sys.exit(main())