Coverage for crowdsourcing / process_issues.py: 98%

323 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-21 14:31 +0000

1# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5 

6import csv 

7import io 

8import json 

9import logging 

10import os 

11import re 

12import shutil 

13import time 

14from datetime import datetime 

15from typing import List, Optional, Tuple 

16 

17import requests 

18import yaml 

19from oc_ds_converter.oc_idmanager.base import IdentifierManager 

20from oc_ds_converter.oc_idmanager.doi import DOIManager 

21from oc_ds_converter.oc_idmanager.isbn import ISBNManager 

22from oc_ds_converter.oc_idmanager.openalex import OpenAlexManager 

23from oc_ds_converter.oc_idmanager.pmcid import PMCIDManager 

24from oc_ds_converter.oc_idmanager.pmid import PMIDManager 

25from oc_ds_converter.oc_idmanager.url import URLManager 

26from oc_ds_converter.oc_idmanager.wikidata import WikidataManager 

27from oc_ds_converter.oc_idmanager.wikipedia import WikipediaManager 

28from oc_validator.interface.gui import make_gui, merge_html_files 

29from oc_validator.main import ClosureValidator 

30from crowdsourcing.archive_manager import ArchiveManager 

31from crowdsourcing.zenodo_utils import create_deposition_resource, get_zenodo_token 

32 

# Constants
# Path to the YAML file listing the GitHub users trusted to make deposits
# (consumed by is_in_safe_list below).
SAFE_LIST_PATH = "safe_list.yaml"

# Initialize archive manager
# Module-level singleton used by validate() to register published
# validation reports.
archive_manager = ArchiveManager()

38 

39 

40def _validate_title(title: str) -> Tuple[bool, str]: 

41 """Validate the format and identifier in an issue title.""" 

42 basic_format = re.search( 

43 r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", 

44 title, 

45 re.IGNORECASE, 

46 ) 

47 if not basic_format: 

48 return ( 

49 False, 

50 'The title of the issue was not structured correctly. Please, follow this format: deposit {domain name of journal} {doi or other supported identifier}. For example "deposit localhost:330 doi:10.1007/978-3-030-00668-6_8". The following identifiers are currently supported: doi, isbn, pmid, pmcid, url, wikidata, wikipedia, openalex, temp, and local', 

51 ) 

52 

53 match = re.search( 

54 r"deposit\s+(.+?)\s+([a-zA-Z]+):(.+)", 

55 title, 

56 re.IGNORECASE, 

57 ) 

58 

59 identifier_schema = match.group(2).lower() 

60 identifier = match.group(3) 

61 

62 # Map of identifier types to their manager classes 

63 manager_map = { 

64 "doi": DOIManager, 

65 "isbn": ISBNManager, 

66 "pmid": PMIDManager, 

67 "pmcid": PMCIDManager, 

68 "url": URLManager, 

69 "wikidata": WikidataManager, 

70 "wikipedia": WikipediaManager, 

71 "openalex": OpenAlexManager, 

72 } 

73 

74 # Special handling for temp and local identifiers 

75 if identifier_schema in ["temp", "local"]: 

76 # For temp and local identifiers, we just check they have some content after the colon 

77 if not identifier.strip(): 

78 return False, f"The {identifier_schema} identifier cannot be empty" 

79 return True, "" 

80 

81 manager_class = manager_map.get(identifier_schema) 

82 if not manager_class: 

83 return False, f"The identifier schema '{identifier_schema}' is not supported" 

84 

85 # Use API service for all identifiers that require online validation 

86 needs_api = {"doi", "pmid", "pmcid", "url", "wikidata", "wikipedia", "openalex"} 

87 id_manager: IdentifierManager = ( 

88 manager_class(use_api_service=True) 

89 if identifier_schema in needs_api 

90 else manager_class() 

91 ) 

92 is_valid = id_manager.is_valid(identifier) 

93 

94 if not is_valid: 

95 return ( 

96 False, 

97 f"The identifier with literal value {identifier} specified in the issue title is not a valid {identifier_schema.upper()}", 

98 ) 

99 return True, "" 

100 

101 

def validate(
    issue_title: str,
    issue_body: str,
    issue_number: str,
    validation_output_dir: str = "validation_output",
    validation_reports_dir: str = "docs/validation_reports",
) -> Tuple[bool, str]:
    """Validate issue title and body content using oc_validator.

    Args:
        issue_title: Title of the GitHub issue
        issue_body: Body content of the GitHub issue
        issue_number: GitHub issue number to update
        validation_output_dir: Directory for temporary validation output files
        validation_reports_dir: Directory where validation reports will be stored

    Returns:
        Tuple containing:
        - bool: Whether the content is valid
        - str: Validation message or error details
    """
    logger = logging.getLogger(__name__)

    logger.info("Starting validation")
    logger.info(f"Validating title: {issue_title}")

    # First validate the title format
    is_valid_title, title_message = _validate_title(issue_title)
    if not is_valid_title:
        logger.warning(f"Invalid title format: {title_message}")
        return False, title_message

    # Check if body is empty
    if not issue_body:
        logger.warning("Empty issue body")
        return (
            False,
            "The issue body cannot be empty. Please provide metadata and citations in CSV format separated by '===###===@@@===', as shown in the guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md",
        )

    # Check for required separator
    if "===###===@@@===" not in issue_body:
        logger.warning("Missing required separator in issue body")
        return (
            False,
            'Please use the separator "===###===@@@===" to divide metadata from citations, as shown in the following guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md',
        )

    try:
        logger.info("Creating validation output directory")
        os.makedirs(validation_output_dir, exist_ok=True)
        os.makedirs(validation_reports_dir, exist_ok=True)

        # Split the data into metadata and citations
        split_data = issue_body.split("===###===@@@===")
        metadata_csv = split_data[0].strip()
        citations_csv = split_data[1].strip()

        # Create temporary files for validation (removed in the finally block)
        with open("temp_metadata.csv", "w", encoding="utf-8") as f:
            f.write(metadata_csv)
        with open("temp_citations.csv", "w", encoding="utf-8") as f:
            f.write(citations_csv)

        # Initialize and run validator
        validator = ClosureValidator(
            meta_csv_doc="temp_metadata.csv",
            meta_output_dir=validation_output_dir,
            cits_csv_doc="temp_citations.csv",
            cits_output_dir=validation_output_dir,
            strict_sequenciality=True,
            meta_kwargs={"verify_id_existence": True},
            cits_kwargs={"verify_id_existence": True},
        )

        # Run the validator for its side effects: it writes the summary and
        # JSON report files inspected below. (The return value was
        # previously assigned to an unused variable.)
        validator.validate()

        # A non-empty summary file means the corresponding section failed.
        has_meta_errors = (
            os.path.exists(f"{validation_output_dir}/meta_validation_summary.txt")
            and os.path.getsize(f"{validation_output_dir}/meta_validation_summary.txt")
            > 0
        )
        has_cits_errors = (
            os.path.exists(f"{validation_output_dir}/cits_validation_summary.txt")
            and os.path.getsize(f"{validation_output_dir}/cits_validation_summary.txt")
            > 0
        )

        if has_meta_errors or has_cits_errors:
            # Generate HTML report for validation errors
            report_filename = f"validation_issue_{issue_number}.html"
            report_path = f"{validation_reports_dir}/{report_filename}"

            # Generate metadata report if there were metadata errors
            if has_meta_errors:
                make_gui(
                    "temp_metadata.csv",
                    f"{validation_output_dir}/out_validate_meta.json",
                    f"{validation_output_dir}/meta_report.html",
                )

            # Generate citations report if there were citation errors
            if has_cits_errors:
                make_gui(
                    "temp_citations.csv",
                    f"{validation_output_dir}/out_validate_cits.json",
                    f"{validation_output_dir}/cits_report.html",
                )

            # Merge reports if both exist, otherwise copy the single report
            if has_meta_errors and has_cits_errors:
                merge_html_files(
                    f"{validation_output_dir}/meta_report.html",
                    f"{validation_output_dir}/cits_report.html",
                    report_path,
                )
            elif has_meta_errors:
                shutil.copy(f"{validation_output_dir}/meta_report.html", report_path)
            else:  # has_cits_errors
                shutil.copy(f"{validation_output_dir}/cits_report.html", report_path)

            # Get repository from environment and construct the GitHub Pages
            # URL where the report will be published
            repository = os.environ["GITHUB_REPOSITORY"]
            base_url = f"https://{repository.split('/')[0]}.github.io/{repository.split('/')[1]}"
            report_url = f"{base_url}/validation_reports/{report_filename}"

            # Add report to archive manager
            archive_manager.add_report(report_filename, report_url)

            # Create error message based on which parts have errors
            error_parts = []
            if has_meta_errors:
                error_parts.append("metadata")
            if has_cits_errors:
                error_parts.append("citations")

            # Use index.html with report parameter for redirection
            error_message = f"Validation errors found in {' and '.join(error_parts)}. Please check the detailed validation report: {base_url}/validation_reports/index.html?report={report_filename}"
            return False, error_message

        # If no validation errors, return success
        # Check if this is a test deposit (localhost domain)
        domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
        is_test = domain_match and "localhost" in domain_match.group(1).lower()

        if is_test:
            success_message = "Test deposit validated successfully! This is recognized as a test deposit (localhost domain) and will not be uploaded to Zenodo or processed into OpenCitations Index/Meta. The data has been validated and the issue will be closed."
        else:
            success_message = "Thank you for your contribution! OpenCitations just processed the data you provided. The citations will soon be available on the [OpenCitations Index](https://opencitations.net/index) and metadata on [OpenCitations Meta](https://opencitations.net/meta)"

        return (True, success_message)

    except Exception as e:
        logger.error(f"Validation error: {e}")
        return (
            False,
            "Please ensure both metadata and citations are valid CSVs following the required format. For more information about the correct format, please check our guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md",
        )
    finally:
        # Clean up temporary files in all cases
        cleanup_files = [
            "temp_metadata.csv",
            "temp_citations.csv",
        ]
        for file in cleanup_files:
            if os.path.exists(file):
                os.remove(file)

        if os.path.exists(validation_output_dir):
            shutil.rmtree(validation_output_dir)

274 

275 

def answer(
    is_valid: bool, message: str, issue_number: str, is_authorized: bool = True, is_test: bool = False
) -> None:
    """Label, comment on, and close a GitHub issue via the REST API.

    Args:
        is_valid: Whether the issue content is valid
        message: Comment message to add
        issue_number: GitHub issue number to update
        is_authorized: Whether the user is authorized (in safe list)
        is_test: Whether this is a test issue (localhost domain)
    """
    print(f"Updating issue #{issue_number}")

    # Pick the label reflecting authorization, validity and test status.
    if not is_authorized:
        label = "rejected"
    elif not is_valid:
        label = "invalid"
    elif is_test:
        # Test issues are marked as done since they won't be processed
        label = "done"
    else:
        label = "to be processed"

    print(f"Adding label '{label}' to issue #{issue_number}")

    # Get repository from environment
    repo = os.environ["GITHUB_REPOSITORY"]

    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issues_api = f"https://api.github.com/repos/{repo}/issues"

    # Step 1: attach the label.
    try:
        label_resp = requests.post(
            f"{issues_api}/{issue_number}/labels",
            headers=request_headers,
            json={"labels": [label]},
            timeout=30,
        )
        label_resp.raise_for_status()
        print(f"Successfully added label '{label}' to issue #{issue_number}")
    except requests.RequestException as e:
        print(f"Error adding label to issue #{issue_number}: {e}")
        raise

    # Step 2: leave the explanatory comment, then close the issue.
    try:
        comment_resp = requests.post(
            f"{issues_api}/{issue_number}/comments",
            headers=request_headers,
            json={"body": message},
            timeout=30,
        )
        comment_resp.raise_for_status()
        print(f"Successfully added comment to issue #{issue_number}")

        close_resp = requests.patch(
            f"{issues_api}/{issue_number}",
            headers=request_headers,
            json={"state": "closed"},
            timeout=30,
        )
        close_resp.raise_for_status()
        print(f"Successfully closed issue #{issue_number}")

    except requests.RequestException as e:
        print(f"Error updating issue #{issue_number}: {e}")
        raise

351 

352 

def get_user_id(username: str) -> Optional[int]:
    """Get GitHub user ID from username with retries on failure.

    Args:
        username: GitHub username to lookup

    Returns:
        The user's GitHub ID if found, None otherwise
    """
    MAX_RETRIES = 3
    RETRY_DELAY = 5  # seconds

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(
                f"https://api.github.com/users/{username}",
                headers={
                    "Accept": "application/vnd.github+json",
                    "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
                },
                timeout=30,
            )
            if response.status_code == 200:
                return response.json().get("id")
            elif response.status_code == 404:
                # Unknown username: no point in retrying.
                return None
            # Handle rate limiting: wait until the limit resets, then retry.
            elif (
                response.status_code == 403
                and "X-RateLimit-Remaining" in response.headers
            ):
                if int(response.headers["X-RateLimit-Remaining"]) == 0:
                    reset_time = int(response.headers["X-RateLimit-Reset"])
                    sleep_time = max(reset_time - time.time(), 0)
                    time.sleep(sleep_time)
                    continue
            # Any other status code signals a transient API problem: retry.

        except (requests.ReadTimeout, requests.ConnectionError):
            # Back off before retrying. Previously ReadTimeout retried
            # immediately with no delay, unlike ConnectionError, hammering
            # an API that just timed out.
            time.sleep(RETRY_DELAY)
            continue

    return None  # All attempts failed

398 

399 

def get_data_to_store(
    issue_title: str,
    issue_body: str,
    created_at: str,
    had_primary_source: str,
    user_id: int,
) -> dict:
    """Get structured data from issue content for storage.

    Args:
        issue_title: Title of the GitHub issue
        issue_body: Body content of the GitHub issue
        created_at: ISO timestamp when issue was created
        had_primary_source: URL of the original issue
        user_id: GitHub user ID of issue author

    Returns:
        Dictionary containing structured issue data and provenance information

    Raises:
        ValueError: If issue body cannot be split or CSV data is invalid
    """
    try:
        # Extract domain from title (e.g., "deposit localhost:330 doi:10.1234/..." -> "localhost:330")
        domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
        domain = domain_match.group(1) if domain_match else ""

        # Split on the FIRST separator only: a stray separator inside the
        # citations section previously broke the two-value unpacking.
        metadata_csv, citations_csv = [
            section.strip()
            for section in issue_body.split("===###===@@@===", maxsplit=1)
        ]

        metadata = list(csv.DictReader(io.StringIO(metadata_csv)))
        citations = list(csv.DictReader(io.StringIO(citations_csv)))

        # Validate required data
        if not metadata or not citations:
            raise ValueError("Empty metadata or citations section")

        return {
            "data": {
                "title": issue_title,
                "domain": domain,
                "metadata": metadata,
                "citations": citations,
            },
            "provenance": {
                "generatedAtTime": created_at,
                "wasAttributedTo": f"https://api.github.com/user/{user_id}",
                "hadPrimarySource": had_primary_source,
            },
        }
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(f"Failed to process issue data: {str(e)}") from e

454 

455 

def _get_zenodo_token() -> str:
    """Get the appropriate Zenodo token based on environment.

    Thin wrapper around crowdsourcing.zenodo_utils.get_zenodo_token, kept so
    the rest of this module depends only on a private local name.
    """
    return get_zenodo_token()

459 

460 

def _create_deposition_resource(
    date: str, base_url: str = "https://zenodo.org/api"
) -> Tuple[str, str]:
    """Create a new deposition resource on Zenodo.

    Builds the deposition metadata for the monthly crowdsourcing upload and
    delegates creation to crowdsourcing.zenodo_utils.create_deposition_resource.
    """
    month = date[:7]

    creator = {
        "name": "crocibot",
        "affiliation": "Research Centre for Open Scholarly Metadata, Department of Classical Philology and Italian Studies, University of Bologna, Bologna, Italy",
    }
    source_repo = {
        "identifier": "https://github.com/opencitations/crowdsourcing",
        "relation": "isDerivedFrom",
        "resource_type": "dataset",
    }

    metadata = {
        "upload_type": "dataset",
        "publication_date": date,
        "title": f"OpenCitations crowdsourcing: deposits of {month}",
        "creators": [creator],
        "description": f"OpenCitations collects citation data and related metadata from the community through issues on the GitHub repository <a href='https://github.com/opencitations/crowdsourcing'>https://github.com/opencitations/crowdsourcing</a>. In order to preserve long-term provenance information, such data is uploaded to Zenodo every month. This upload contains the data of deposit issues published in {month}.",
        "access_right": "open",
        "license": "CC0-1.0",
        "prereserve_doi": True,
        "keywords": [
            "OpenCitations",
            "crowdsourcing",
            "provenance",
            "GitHub issues",
        ],
        "related_identifiers": [source_repo],
        "version": "1.0.0",
    }

    return create_deposition_resource(date, metadata, base_url)

495 

496 

def _upload_data(
    date: str, bucket: str, base_url: str = "https://zenodo.org/api"
) -> None:
    """Upload the staged data file to a Zenodo deposition bucket.

    Args:
        date: ISO date (YYYY-MM-DD) used to build the remote file name.
        bucket: Bucket URL of the target Zenodo deposition.
        base_url: Zenodo API root. NOTE(review): currently unused here;
            kept for interface symmetry with the other Zenodo helpers.

    Raises:
        requests.HTTPError: If Zenodo rejects the upload.
    """
    filename = f"{date}_weekly_deposit.json"

    with open("data_to_store.json", "rb") as fp:
        # BUG FIX: the upload URL previously contained a literal placeholder
        # instead of interpolating `filename`, which was computed but unused.
        response = requests.put(
            f"{bucket}/{filename}",
            data=fp,
            params={"access_token": _get_zenodo_token()},
            timeout=30,
        )
        response.raise_for_status()

511 

512 

def deposit_on_zenodo(data_to_store: List[dict]) -> None:
    """Deposit data on Zenodo based on environment.

    Stages the payload in a temporary JSON file, creates a deposition,
    uploads the file and publishes the deposition. The temporary file is
    always removed afterwards.
    """
    environment = os.environ.get("ENVIRONMENT", "development")

    # Development deposits go to the Zenodo Sandbox.
    base_url = (
        "https://sandbox.zenodo.org/api"
        if environment == "development"
        else "https://zenodo.org/api"
    )

    try:
        # Stage the payload in a temporary file.
        with open("data_to_store.json", "w") as f:
            json.dump(data_to_store, f)

        # Create a fresh deposition resource.
        deposition_id, bucket = _create_deposition_resource(
            datetime.now().strftime("%Y-%m-%d"), base_url=base_url
        )

        # Upload the staged data into the deposition bucket.
        _upload_data(datetime.now().strftime("%Y-%m-%d"), bucket, base_url=base_url)

        # Publish the deposition (Zenodo answers 202 on success).
        response = requests.post(
            f"{base_url}/deposit/depositions/{deposition_id}/actions/publish",
            params={"access_token": _get_zenodo_token()},
            timeout=30,
        )

        if response.status_code != 202:
            raise Exception(f"Failed to publish deposition: {response.text}")

    finally:
        # Clean up the temporary file in every case.
        if os.path.exists("data_to_store.json"):
            os.remove("data_to_store.json")

550 

551 

def is_in_safe_list(user_id: int) -> bool:
    """Check if a user ID is in the safe list.

    Args:
        user_id: GitHub user ID to check

    Returns:
        bool: True if user is in safe list, False otherwise
    """
    try:
        with open(SAFE_LIST_PATH, "r") as f:
            safe_list = yaml.safe_load(f)
        # yaml.safe_load returns None for an empty file (and may return a
        # non-dict for malformed content); treat both as an empty safe list
        # instead of crashing on .get below.
        if not isinstance(safe_list, dict):
            return False
        # `or []` guards against a present-but-null `users:` key.
        # Compare as strings so int and str IDs in the YAML both match.
        allowed_ids = {str(user["id"]) for user in safe_list.get("users") or []}
        return str(user_id) in allowed_ids
    except FileNotFoundError:
        print("Warning: safe_list.yaml not found, creating empty file")
        # Create empty safe list file with proper structure
        with open(SAFE_LIST_PATH, "w") as f:
            yaml.dump({"users": []}, f)
        return False
    except yaml.YAMLError as e:
        print(f"Error parsing safe_list.yaml: {e}")
        return False

576 

577 

def get_open_issues() -> List[dict]:
    """Fetch open issues with title starting with 'deposit ' that have no labels.

    Issues without labels have not been processed yet by the bot.
    Once processed, the bot adds a label (rejected, invalid, to be processed).

    Returns:
        List of dicts with the fields the pipeline needs (title, body,
        number, author, createdAt, url); empty list when nothing matches or
        the repository is not found.

    Raises:
        RuntimeError: If every retry attempt fails with a request error.
    """
    print("Attempting to fetch open issues...")

    # Get repository info from GitHub Actions environment
    repository = os.environ["GITHUB_REPOSITORY"]
    print(f"Checking repository: {repository}")

    MAX_RETRIES = 3
    RETRY_DELAY = 5

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
    }

    for attempt in range(MAX_RETRIES):
        try:
            print(f"Attempt {attempt + 1} of {MAX_RETRIES}")
            # per_page=100 (the API maximum): the default page size of 30
            # would silently truncate a larger backlog of open issues.
            response = requests.get(
                f"https://api.github.com/repos/{repository}/issues",
                params={"state": "open", "per_page": 100},
                headers=headers,
                timeout=30,
            )

            print(f"Response status code: {response.status_code}")

            if response.status_code == 200:
                issues = response.json()
                print(f"Found {len(issues)} total open issues")

                # Filter issues by title and absence of labels
                deposit_issues = []
                for issue in issues:
                    # Skip if title doesn't start with 'deposit '
                    if not issue["title"].startswith("deposit "):
                        continue

                    # Skip if issue already has labels (already processed by bot)
                    if issue.get("labels"):
                        print(f"Skipping issue #{issue['number']} - already has labels")
                        continue

                    deposit_issues.append({
                        "title": issue["title"],
                        "body": issue["body"],
                        "number": str(issue["number"]),
                        "author": {"login": issue["user"]["login"]},
                        "createdAt": issue["created_at"],
                        "url": issue["html_url"],
                    })

                print(f"Found {len(deposit_issues)} deposit issues to process")
                return deposit_issues

            elif response.status_code == 404:
                print("Repository or endpoint not found (404)")
                return []

            elif (
                response.status_code == 403
                and "X-RateLimit-Remaining" in response.headers
            ):
                print(
                    f"Rate limit info: {response.headers.get('X-RateLimit-Remaining')} requests remaining"
                )
                # Wait until the advertised reset time before retrying.
                if int(response.headers["X-RateLimit-Remaining"]) == 0:
                    reset_time = int(response.headers["X-RateLimit-Reset"])
                    current_time = time.time()
                    if reset_time > current_time:
                        sleep_time = reset_time - current_time
                        print(f"Rate limit exceeded. Waiting {sleep_time} seconds")
                        time.sleep(sleep_time)
                    continue
                continue
            else:
                print(f"Unexpected status code: {response.status_code}")
                print(f"Response body: {response.text}")

        except (requests.RequestException, KeyError) as e:
            print(f"Error during request: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                print(f"Waiting {RETRY_DELAY} seconds before retry")
                time.sleep(RETRY_DELAY)
                continue
            raise RuntimeError(
                f"Failed to fetch issues after {MAX_RETRIES} attempts"
            ) from e

    return []

673 

674 

def process_open_issues() -> None:
    """Process all open deposit issues end to end.

    For each unlabelled open issue: resolve the author's GitHub user ID,
    reject authors missing from the safe list, validate the issue content,
    post the outcome back to the issue, and finally deposit the collected
    data of valid non-test issues on Zenodo.
    """
    try:
        print("Starting to process open issues")
        issues = get_open_issues()
        print(f"Found {len(issues)} open issues to process")

        data_to_store = list()

        for issue in issues:
            issue_number = issue["number"]
            print(f"Processing issue #{issue_number}")

            username = issue["author"]["login"]
            print(f"Getting user ID for {username}")
            user_id = get_user_id(username)
            print(f"User ID for {username}: {user_id}")

            # user_id may be None when the lookup failed; such users can
            # never appear in the safe list, so they are rejected here.
            if not is_in_safe_list(user_id):
                print(f"WARNING: User {username} (ID: {user_id}) not in safe list")
                answer(
                    False,
                    "To make a deposit, please contact OpenCitations at <contact@opencitations.net> to register as a trusted user",
                    issue_number,
                    is_authorized=False,
                )
                continue

            print(f"User {username} is authorized")
            issue_title = issue["title"]
            issue_body = issue["body"]
            created_at = issue["createdAt"]
            had_primary_source = issue["url"]

            print(f"Validating issue #{issue_number}")
            is_valid, message = validate(issue_title, issue_body, issue_number)
            print(
                f"Validation result for #{issue_number}: valid={is_valid}, message={message}"
            )

            # Check if this is a test issue (localhost domain). bool() keeps
            # the flag a genuine boolean even when the regex does not match
            # (the short-circuit previously produced None in that case).
            domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
            is_test = bool(domain_match and "localhost" in domain_match.group(1).lower())

            answer(is_valid, message, issue_number, is_authorized=True, is_test=is_test)
            print(f"Posted answer to issue #{issue_number}")

            if is_valid:
                print(f"Getting data to store for issue #{issue_number}")
                try:
                    issue_data = get_data_to_store(
                        issue_title, issue_body, created_at, had_primary_source, user_id
                    )
                    data_to_store.append(issue_data)
                    print(f"Successfully processed data for issue #{issue_number}")
                except Exception as e:
                    print(f"ERROR: Processing data for issue #{issue_number}: {e}")
                    continue

        if data_to_store:
            # Filter out test issues (those with "localhost" in domain)
            production_data = [
                item for item in data_to_store
                if "localhost" not in item.get("data", {}).get("domain", "").lower()
            ]
            test_data = [
                item for item in data_to_store
                if "localhost" in item.get("data", {}).get("domain", "").lower()
            ]

            if test_data:
                print(f"Skipping Zenodo deposit for {len(test_data)} test issue(s) with localhost domain")

            if production_data:
                print(f"Attempting to deposit {len(production_data)} production items to Zenodo")
                try:
                    deposit_on_zenodo(production_data)
                    print("Successfully deposited data to Zenodo")
                except Exception as e:
                    print(f"ERROR: Failed to deposit data to Zenodo: {e}")
                    raise
            else:
                print("No production data to deposit (all issues are test issues with localhost domain)")

    except Exception as e:
        print(f"ERROR: Processing issues: {e}")
        raise
    finally:
        print("Completed processing open issues")

765 

766 

# Script entry point: scan and process every open deposit issue.
if __name__ == "__main__": # pragma: no cover
    process_open_issues()