Coverage for src/piccione/download/from_figshare.py: 100%

33 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 13:41 +0000

1#!/usr/bin/python 

2# -*- coding: utf-8 -*- 

3# Copyright (c) 2025, Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# Permission to use, copy, modify, and/or distribute this software for any purpose 

6# with or without fee is hereby granted, provided that the above copyright notice 

7# and this permission notice appear in all copies. 

8# 

9# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 

12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 

13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 

14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 

15# SOFTWARE. 

16 

17""" 

18Download files from a Figshare article using the Figshare API. 

19 

20This script downloads all files associated with a Figshare article ID. 

21It uses the public Figshare API which works reliably unlike direct wget/curl 

22on Figshare URLs. 

23""" 

24 

25import argparse 

26import hashlib 

27import sys 

28from pathlib import Path 

29 

30import requests 

31from tqdm import tqdm 

32 

# Root endpoint of the public Figshare REST API (version 2).
BASE_URL = "https://api.figshare.com/v2"
# Bytes read per iteration when streaming a download (8 KiB).
CHUNK_SIZE = 8192

35 

36 

def get_article_metadata(article_id):
    """Retrieve article metadata, including the full file list, from the Figshare API.

    Args:
        article_id: Figshare article identifier (int).

    Returns:
        dict: Article metadata as returned by the API, with the ``files``
        key replaced by the complete (paginated) file listing.

    Raises:
        requests.HTTPError: If either API request returns an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = f"{BASE_URL}/articles/{article_id}"
    # requests has no default timeout; without one a stalled connection
    # would hang the script indefinitely.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    article_data = response.json()

    # The article endpoint returns at most 10 files by default, so the
    # files must be fetched separately with an explicit large page size.
    files_url = f"{BASE_URL}/articles/{article_id}/files"
    files_response = requests.get(files_url, params={"page_size": 1000}, timeout=60)
    files_response.raise_for_status()
    article_data['files'] = files_response.json()

    return article_data

51 

52 

def download_file(download_url, output_path, expected_size, expected_md5=None):
    """Download a file from URL with a progress bar and optional MD5 verification.

    Args:
        download_url: Direct download URL for the file.
        output_path: Destination path for the downloaded bytes.
        expected_size: Expected file size in bytes (drives the progress bar).
        expected_md5: Optional MD5 hex digest to verify the download against.

    Raises:
        requests.HTTPError: If the download request returns an error status.
        ValueError: If ``expected_md5`` is given and does not match the
            downloaded content.
    """
    # Stream the response and use it as a context manager so the
    # connection is released even if an exception interrupts the download;
    # the timeout guards against a connection that stalls mid-transfer.
    with requests.get(download_url, stream=True, timeout=60) as response:
        response.raise_for_status()

        md5_hash = hashlib.md5()

        with open(output_path, 'wb') as f:
            with tqdm(total=expected_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
                    md5_hash.update(chunk)
                    pbar.update(len(chunk))

    if expected_md5:
        actual_md5 = md5_hash.hexdigest()
        if actual_md5 != expected_md5:
            raise ValueError(f"MD5 mismatch: expected {expected_md5}, got {actual_md5}")
        print(f" MD5 checksum verified: {actual_md5}")

72 

73 

def main():  # pragma: no cover
    """Parse CLI arguments and download every file of a Figshare article.

    Returns:
        int: 0 on success, 1 if the article contains no files (suitable
        for use as a process exit code).
    """
    parser = argparse.ArgumentParser(
        description="Download files from a Figshare article"
    )
    parser.add_argument(
        "article_id",
        type=int,
        help="Figshare article ID"
    )
    parser.add_argument(
        "-o", "--output-dir",
        type=Path,
        default=Path("."),
        help="Output directory for downloaded files (default: current directory)"
    )

    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Fetching metadata for article {args.article_id}...")
    metadata = get_article_metadata(args.article_id)

    files = metadata.get("files", [])
    if not files:
        print("No files found in this article")
        return 1

    print(f"\nFound {len(files)} file(s) to download:")
    for f in files:
        size_mb = f['size'] / (1024 * 1024)
        print(f" - {f['name']} ({size_mb:.2f} MB)")

    print(f"\nDownloading to: {args.output_dir.absolute()}\n")

    for file_info in files:
        filename = file_info['name']
        download_url = file_info['download_url']
        size = file_info['size']
        # supplied_md5 may be absent for some files; verification is skipped then
        md5 = file_info.get('supplied_md5')

        output_path = args.output_dir / filename

        # BUG FIX: the original printed the literal "(unknown)" instead of
        # interpolating the file name (f-string had no placeholder).
        print(f"Downloading {filename}...")
        download_file(download_url, output_path, size, md5)
        print(f" Saved to {output_path}\n")

    print("All files downloaded successfully")
    return 0

123 

124 

# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())