Coverage for src / piccione / download / from_figshare.py: 100%
33 statements
« prev ^ index » next coverage.py v7.13.0, created at 2026-03-21 11:49 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2026-03-21 11:49 +0000
1#!/usr/bin/python
3# Copyright (C) 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
4# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
5#
6# SPDX-License-Identifier: ISC
8"""
9Download files from a Figshare article using the Figshare API.
11This script downloads all files associated with a Figshare article ID.
12It uses the public Figshare API which works reliably unlike direct wget/curl
13on Figshare URLs.
14"""
16import argparse
17import hashlib
18import sys
19from pathlib import Path
21import requests
22from tqdm import tqdm
# Root of the public Figshare REST API (version 2); endpoint paths are appended to it.
BASE_URL = "https://api.figshare.com/v2"
# Number of bytes requested per chunk when streaming a download (8 KiB).
CHUNK_SIZE = 8 * 1024
def get_article_metadata(article_id, timeout=30):
    """Retrieve article metadata, including its complete file list, from the Figshare API.

    Args:
        article_id: Figshare article ID, interpolated into the API URLs.
        timeout: Seconds to wait on each HTTP request, forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        The decoded JSON article record, with its ``'files'`` entry replaced
        by the list collected from the article's files endpoint.

    Raises:
        requests.HTTPError: If any request comes back with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Number of entries requested per page of the files endpoint.
    page_size = 1000

    url = f"{BASE_URL}/articles/{article_id}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    article_data = response.json()

    # Figshare API has a default limit of 10 files, so the file list is
    # fetched separately and paged through until a short page is returned.
    files_url = f"{BASE_URL}/articles/{article_id}/files"
    files = []
    page = 1
    while True:
        files_response = requests.get(
            files_url,
            params={"page": page, "page_size": page_size},
            timeout=timeout,
        )
        files_response.raise_for_status()
        batch = files_response.json()
        files.extend(batch)
        # A page shorter than page_size (including an empty one) is the last.
        if len(batch) < page_size:
            break
        page += 1

    article_data['files'] = files
    return article_data
def download_file(download_url, output_path, expected_size, expected_md5=None, timeout=30):
    """Download a file from URL with progress bar and optional MD5 verification.

    Args:
        download_url: URL the file is streamed from.
        output_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        expected_size: Total passed to the progress bar, in bytes.
        expected_md5: Hex MD5 digest to check the downloaded bytes against.
            Verification is skipped when this is falsy.
        timeout: Seconds forwarded to ``requests.get`` so a stalled server
            cannot block the download forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If ``expected_md5`` is given and does not match the digest
            of the downloaded bytes. The file is left on disk in that case.
    """
    md5_hash = hashlib.md5()

    response = requests.get(download_url, stream=True, timeout=timeout)
    try:
        response.raise_for_status()

        with open(output_path, 'wb') as f:
            with tqdm(total=expected_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
                    md5_hash.update(chunk)
                    pbar.update(len(chunk))
    finally:
        # A streamed response keeps its connection checked out until it is
        # fully consumed or closed; close it even when an error interrupts us.
        response.close()

    if expected_md5:
        actual_md5 = md5_hash.hexdigest()
        # hexdigest() is lower-case; compare case-insensitively so an
        # upper-case expected checksum is not reported as a mismatch.
        if actual_md5 != expected_md5.lower():
            raise ValueError(f"MD5 mismatch: expected {expected_md5}, got {actual_md5}")
        print(f"  MD5 checksum verified: {actual_md5}")
def main():  # pragma: no cover
    """Command-line entry point: download every file of a Figshare article.

    Parses the article ID and output directory from the command line, creates
    the output directory, fetches the article's file list, and downloads each
    file into that directory.

    Returns:
        0 when all files were downloaded, 1 when the article lists no files.

    Raises:
        ValueError: If a file name supplied by the API does not reduce to a
            usable plain file name, or if a download fails MD5 verification.
    """
    parser = argparse.ArgumentParser(
        description="Download files from a Figshare article"
    )
    parser.add_argument(
        "article_id",
        type=int,
        help="Figshare article ID"
    )
    parser.add_argument(
        "-o", "--output-dir",
        type=Path,
        default=Path("."),
        help="Output directory for downloaded files (default: current directory)"
    )

    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Fetching metadata for article {args.article_id}...")
    metadata = get_article_metadata(args.article_id)

    files = metadata.get("files", [])
    if not files:
        print("No files found in this article")
        return 1

    print(f"\nFound {len(files)} file(s) to download:")
    for f in files:
        size_mb = f['size'] / (1024 * 1024)
        print(f"  - {f['name']} ({size_mb:.2f} MB)")

    print(f"\nDownloading to: {args.output_dir.absolute()}\n")

    for file_info in files:
        filename = file_info['name']
        download_url = file_info['download_url']
        size = file_info['size']
        md5 = file_info.get('supplied_md5')

        # The name comes from the remote API, so keep only its final path
        # component: directory parts or ".." must not let a download land
        # outside the chosen output directory.
        safe_name = filename.replace("\\", "/").rsplit("/", 1)[-1]
        if safe_name in ("", ".", ".."):
            raise ValueError(f"Unsafe file name in article metadata: {filename!r}")

        output_path = args.output_dir / safe_name

        print(f"Downloading {filename}...")
        download_file(download_url, output_path, size, md5)
        print(f"  Saved to {output_path}\n")

    print("All files downloaded successfully")
    return 0
# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())