Coverage for src / piccione / download / from_figshare.py: 100%
44 statements
« prev ^ index » next coverage.py v7.13.0, created at 2026-05-27 20:21 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2026-05-27 20:21 +0000
1# Copyright (C) 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
2# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
3#
4# SPDX-License-Identifier: ISC
6"""
7Download files from a Figshare article using the Figshare API.
9This script downloads all files associated with a Figshare article ID.
10It uses the public Figshare API which works reliably unlike direct wget/curl
11on Figshare URLs.
12"""
14from __future__ import annotations
16import argparse
17import hashlib
18import sys
19from pathlib import Path
20from typing import TypedDict
22import requests
23from rich.console import Console
24from tqdm import tqdm
# Shared rich console used for every message this module prints.
console = Console()

# Root of the Figshare v2 REST API; endpoint paths are appended to it.
BASE_URL = "https://api.figshare.com/v2"
# Number of bytes requested per chunk while streaming a download to disk.
CHUNK_SIZE = 8192
class FigshareFileEntry(TypedDict):
    """Fields of a Figshare file record that this module reads."""

    # File name; main() joins it onto the output directory to build the destination path.
    name: str
    # File size; main() divides it by 1024 * 1024 to display MB, and it is
    # used as the progress-bar total (unit "B") in download_file().
    size: int
    # URL handed to download_file() to fetch the file content.
    download_url: str
    # MD5 digest handed to download_file() for verification; may be None,
    # in which case no checksum comparison is performed.
    supplied_md5: str | None
class FigshareArticleMetadata(TypedDict):
    """Article metadata as returned by get_article_metadata().

    Only the ``files`` key is declared because it is the only one this
    module reads; the underlying dict is the article endpoint's JSON
    payload, so it may carry additional keys.
    """

    # File records belonging to the article.
    files: list[FigshareFileEntry]
def get_article_metadata(article_id: int) -> FigshareArticleMetadata:
    """Fetch an article's metadata together with its complete file list.

    The article record is requested first, then its ``files`` entry is
    replaced with the result of walking the dedicated files endpoint page
    by page, so articles with more files than fit in one page are not
    truncated.

    Args:
        article_id: Numeric Figshare article identifier.

    Returns:
        The article JSON payload with ``files`` set to every file record.

    Raises:
        requests.HTTPError: If any request returns an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    article_url = f"{BASE_URL}/articles/{article_id}"
    response = requests.get(article_url, timeout=30)
    response.raise_for_status()
    article_data = response.json()

    # The article payload only embeds a limited number of files (10 by
    # default), so the file list is fetched from its own endpoint. A single
    # page holds at most `page_size` entries, hence the loop.
    page_size = 1000
    files_url = f"{article_url}/files"
    files: list[FigshareFileEntry] = []
    page = 1
    while True:
        files_response = requests.get(
            files_url,
            params={"page": page, "page_size": page_size},
            timeout=30,
        )
        files_response.raise_for_status()
        batch = files_response.json()
        files.extend(batch)
        # A short (or empty) page means there is nothing left to fetch.
        if len(batch) < page_size:
            break
        page += 1

    article_data["files"] = files
    return article_data
def download_file(
    download_url: str,
    output_path: str | Path,
    expected_size: int,
    expected_md5: str | None = None,
) -> None:
    """Stream a file to disk, showing progress and optionally checking its MD5.

    Args:
        download_url: URL to download from.
        output_path: Destination file; it is created or overwritten.
        expected_size: Size in bytes, used as the progress-bar total.
        expected_md5: Hex MD5 digest to verify against. Verification is
            skipped when this is ``None`` or empty.

    Raises:
        requests.HTTPError: If the server returns an error status.
        ValueError: If the downloaded content does not match ``expected_md5``.
    """
    # MD5 is used purely as an integrity checksum, not for security.
    md5_hash = hashlib.md5(usedforsecurity=False)

    response = requests.get(download_url, stream=True, timeout=(30, 300))
    try:
        response.raise_for_status()
        with (
            Path(output_path).open("wb") as f,
            tqdm(total=expected_size, unit="B", unit_scale=True, unit_divisor=1024) as pbar,
        ):
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                md5_hash.update(chunk)
                pbar.update(len(chunk))
    finally:
        # A streamed response keeps its connection checked out until the body
        # is fully consumed or the response is closed; close it explicitly so
        # a failure mid-download does not leak the connection.
        response.close()

    if expected_md5:
        actual_md5 = md5_hash.hexdigest()
        # hexdigest() is lowercase; normalise the supplied digest so that a
        # difference in case or stray whitespace is not reported as corruption.
        if actual_md5 != expected_md5.strip().lower():
            msg = f"MD5 mismatch: expected {expected_md5}, got {actual_md5}"
            raise ValueError(msg)
        console.print(f"  MD5 checksum verified: {actual_md5}")
def main() -> int:  # pragma: no cover
    """Command-line entry point: download every file of a Figshare article.

    Parses the article ID and output directory from the command line,
    lists the article's files, then downloads them one by one.

    Returns:
        0 when all files were downloaded, 1 when the article has no files
        or a file has a name that cannot be safely written to disk.
    """
    parser = argparse.ArgumentParser(description="Download files from a Figshare article")
    parser.add_argument("article_id", type=int, help="Figshare article ID")
    parser.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        default=Path(),
        help="Output directory for downloaded files (default: current directory)",
    )

    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    console.print(f"Fetching metadata for article {args.article_id}...")
    metadata = get_article_metadata(args.article_id)

    files = metadata["files"]
    if not files:
        console.print("No files found in this article")
        return 1

    console.print(f"\nFound {len(files)} file(s) to download:")
    for f in files:
        size_mb = f["size"] / (1024 * 1024)
        console.print(f"  - {f['name']} ({size_mb:.2f} MB)")

    console.print(f"\nDownloading to: {args.output_dir.absolute()}\n")

    for file_info in files:
        filename = file_info["name"]
        download_url = file_info["download_url"]
        size = file_info["size"]
        md5 = file_info["supplied_md5"]

        # The name comes from a remote API, so keep only its final path
        # component: an absolute path or "../" segments must not be able to
        # place the file outside the chosen output directory.
        safe_name = Path(filename).name
        if safe_name in {"", ".", ".."}:
            console.print(f"Refusing to download file with unsafe name: {filename!r}")
            return 1

        output_path = args.output_dir / safe_name

        console.print(f"Downloading {filename}...")
        download_file(download_url, output_path, size, md5)
        console.print(f"  Saved to {output_path}\n")

    console.print("All files downloaded successfully")
    return 0
if __name__ == "__main__":  # pragma: no cover
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())