Coverage for src/piccione/download/from_figshare.py: 100%
33 statements
coverage.py v7.13.0, created at 2025-12-11 13:41 +0000
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2025, Arcangelo Massari <arcangelo.massari@unibo.it>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

"""
Download files from a Figshare article using the Figshare API.

This script downloads all files associated with a Figshare article ID.
It uses the public Figshare API, which works reliably, unlike direct wget/curl
requests on Figshare URLs.
"""
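# Example invocation (illustrative: the article ID and output directory below
# are hypothetical placeholders, and running via "python -m" assumes the
# piccione package is importable):
#
#     python -m piccione.download.from_figshare 1234567 -o downloads/
#
# Every file attached to the article is saved into the output directory, and
# each download is checked against the MD5 digest supplied by Figshare when
# one is available.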

import argparse
import hashlib
import sys
from pathlib import Path

import requests
from tqdm import tqdm

BASE_URL = "https://api.figshare.com/v2"
CHUNK_SIZE = 8192


def get_article_metadata(article_id):
    """Retrieve article metadata from the Figshare API."""
    url = f"{BASE_URL}/articles/{article_id}"
    response = requests.get(url)
    response.raise_for_status()
    article_data = response.json()

    # Figshare API has a default limit of 10 files. We need to fetch files
    # separately with pagination.
    files_url = f"{BASE_URL}/articles/{article_id}/files"
    files_response = requests.get(files_url, params={"page_size": 1000})
    files_response.raise_for_status()
    article_data['files'] = files_response.json()

    return article_data
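
# Minimal sketch of one entry in the article_data['files'] list returned by
# get_article_metadata above, limited to the keys that main() actually reads.
# The example values are placeholders, not data from a real article, and the
# API returns additional fields that this script ignores:
#
#     {
#         "name": "dataset.zip",
#         "size": 1048576,
#         "download_url": "https://...",
#         "supplied_md5": "<hex digest>",
#     }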

def download_file(download_url, output_path, expected_size, expected_md5=None):
    """Download a file from URL with progress bar and optional MD5 verification."""
    response = requests.get(download_url, stream=True)
    response.raise_for_status()

    md5_hash = hashlib.md5()

    with open(output_path, 'wb') as f:
        with tqdm(total=expected_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                md5_hash.update(chunk)
                pbar.update(len(chunk))

    if expected_md5:
        actual_md5 = md5_hash.hexdigest()
        if actual_md5 != expected_md5:
            raise ValueError(f"MD5 mismatch: expected {expected_md5}, got {actual_md5}")
        print(f" MD5 checksum verified: {actual_md5}")
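
# Illustrative direct call to download_file (the URL, path, and size are
# placeholders, not real Figshare values):
#
#     download_file(
#         "https://example.org/dataset.zip",
#         Path("dataset.zip"),
#         expected_size=1_048_576,
#         expected_md5=None,
#     )
#
# If expected_md5 is given and does not match the downloaded bytes, a
# ValueError is raised after the file has been written to output_path.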

def main():  # pragma: no cover
    parser = argparse.ArgumentParser(
        description="Download files from a Figshare article"
    )
    parser.add_argument(
        "article_id",
        type=int,
        help="Figshare article ID"
    )
    parser.add_argument(
        "-o", "--output-dir",
        type=Path,
        default=Path("."),
        help="Output directory for downloaded files (default: current directory)"
    )

    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Fetching metadata for article {args.article_id}...")
    metadata = get_article_metadata(args.article_id)

    files = metadata.get("files", [])
    if not files:
        print("No files found in this article")
        return 1

    print(f"\nFound {len(files)} file(s) to download:")
    for f in files:
        size_mb = f['size'] / (1024 * 1024)
        print(f" - {f['name']} ({size_mb:.2f} MB)")

    print(f"\nDownloading to: {args.output_dir.absolute()}\n")

    for file_info in files:
        filename = file_info['name']
        download_url = file_info['download_url']
        size = file_info['size']
        md5 = file_info.get('supplied_md5')

        output_path = args.output_dir / filename

        print(f"Downloading {filename}...")
        download_file(download_url, output_path, size, md5)
        print(f" Saved to {output_path}\n")

    print("All files downloaded successfully")
    return 0


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())