Coverage for crowdsourcing / process_issues.py: 98%
323 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-21 14:31 +0000
1# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
2#
3# SPDX-License-Identifier: ISC
6import csv
7import io
8import json
9import logging
10import os
11import re
12import shutil
13import time
14from datetime import datetime
15from typing import List, Optional, Tuple
17import requests
18import yaml
19from oc_ds_converter.oc_idmanager.base import IdentifierManager
20from oc_ds_converter.oc_idmanager.doi import DOIManager
21from oc_ds_converter.oc_idmanager.isbn import ISBNManager
22from oc_ds_converter.oc_idmanager.openalex import OpenAlexManager
23from oc_ds_converter.oc_idmanager.pmcid import PMCIDManager
24from oc_ds_converter.oc_idmanager.pmid import PMIDManager
25from oc_ds_converter.oc_idmanager.url import URLManager
26from oc_ds_converter.oc_idmanager.wikidata import WikidataManager
27from oc_ds_converter.oc_idmanager.wikipedia import WikipediaManager
28from oc_validator.interface.gui import make_gui, merge_html_files
29from oc_validator.main import ClosureValidator
30from crowdsourcing.archive_manager import ArchiveManager
31from crowdsourcing.zenodo_utils import create_deposition_resource, get_zenodo_token
# Constants
SAFE_LIST_PATH = "safe_list.yaml"  # YAML file listing trusted GitHub user IDs

# Module-level archive manager instance, shared by validate() to register
# generated validation reports.
archive_manager = ArchiveManager()
40def _validate_title(title: str) -> Tuple[bool, str]:
41 """Validate the format and identifier in an issue title."""
42 basic_format = re.search(
43 r"deposit\s+(.+?)\s+[a-zA-Z]+:.+",
44 title,
45 re.IGNORECASE,
46 )
47 if not basic_format:
48 return (
49 False,
50 'The title of the issue was not structured correctly. Please, follow this format: deposit {domain name of journal} {doi or other supported identifier}. For example "deposit localhost:330 doi:10.1007/978-3-030-00668-6_8". The following identifiers are currently supported: doi, isbn, pmid, pmcid, url, wikidata, wikipedia, openalex, temp, and local',
51 )
53 match = re.search(
54 r"deposit\s+(.+?)\s+([a-zA-Z]+):(.+)",
55 title,
56 re.IGNORECASE,
57 )
59 identifier_schema = match.group(2).lower()
60 identifier = match.group(3)
62 # Map of identifier types to their manager classes
63 manager_map = {
64 "doi": DOIManager,
65 "isbn": ISBNManager,
66 "pmid": PMIDManager,
67 "pmcid": PMCIDManager,
68 "url": URLManager,
69 "wikidata": WikidataManager,
70 "wikipedia": WikipediaManager,
71 "openalex": OpenAlexManager,
72 }
74 # Special handling for temp and local identifiers
75 if identifier_schema in ["temp", "local"]:
76 # For temp and local identifiers, we just check they have some content after the colon
77 if not identifier.strip():
78 return False, f"The {identifier_schema} identifier cannot be empty"
79 return True, ""
81 manager_class = manager_map.get(identifier_schema)
82 if not manager_class:
83 return False, f"The identifier schema '{identifier_schema}' is not supported"
85 # Use API service for all identifiers that require online validation
86 needs_api = {"doi", "pmid", "pmcid", "url", "wikidata", "wikipedia", "openalex"}
87 id_manager: IdentifierManager = (
88 manager_class(use_api_service=True)
89 if identifier_schema in needs_api
90 else manager_class()
91 )
92 is_valid = id_manager.is_valid(identifier)
94 if not is_valid:
95 return (
96 False,
97 f"The identifier with literal value {identifier} specified in the issue title is not a valid {identifier_schema.upper()}",
98 )
99 return True, ""
def validate(
    issue_title: str,
    issue_body: str,
    issue_number: str,
    validation_output_dir: str = "validation_output",
    validation_reports_dir: str = "docs/validation_reports",
) -> Tuple[bool, str]:
    """Validate issue title and body content using oc_validator.

    Args:
        issue_title: Title of the GitHub issue
        issue_body: Body content of the GitHub issue
        issue_number: GitHub issue number to update
        validation_output_dir: Directory for temporary validation output files
        validation_reports_dir: Directory where validation reports will be stored

    Returns:
        Tuple containing:
        - bool: Whether the content is valid
        - str: Validation message or error details

    Side effects: writes temporary CSV files in the working directory
    (removed in the finally block), writes HTML reports into
    validation_reports_dir, and registers reports with the module-level
    archive_manager. Reads the GITHUB_REPOSITORY environment variable
    (raises KeyError if it is unset when a report URL is built).
    """
    logger = logging.getLogger(__name__)

    logger.info("Starting validation")
    logger.info(f"Validating title: {issue_title}")

    # First validate the title format
    is_valid_title, title_message = _validate_title(issue_title)
    if not is_valid_title:
        logger.warning(f"Invalid title format: {title_message}")
        return False, title_message

    # Check if body is empty
    if not issue_body:
        logger.warning("Empty issue body")
        return (
            False,
            "The issue body cannot be empty. Please provide metadata and citations in CSV format separated by '===###===@@@===', as shown in the guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md",
        )

    # Check for required separator
    if "===###===@@@===" not in issue_body:
        logger.warning("Missing required separator in issue body")
        return (
            False,
            'Please use the separator "===###===@@@===" to divide metadata from citations, as shown in the following guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md',
        )

    try:
        logger.info("Creating validation output directory")
        os.makedirs(validation_output_dir, exist_ok=True)
        os.makedirs(validation_reports_dir, exist_ok=True)

        # Split the data into metadata and citations
        split_data = issue_body.split("===###===@@@===")
        metadata_csv = split_data[0].strip()
        citations_csv = split_data[1].strip()

        # Create temporary files for validation (cleaned up in finally)
        with open("temp_metadata.csv", "w", encoding="utf-8") as f:
            f.write(metadata_csv)
        with open("temp_citations.csv", "w", encoding="utf-8") as f:
            f.write(citations_csv)

        # Initialize and run validator
        validator = ClosureValidator(
            meta_csv_doc="temp_metadata.csv",
            meta_output_dir=validation_output_dir,
            cits_csv_doc="temp_citations.csv",
            cits_output_dir=validation_output_dir,
            strict_sequenciality=True,
            meta_kwargs={"verify_id_existence": True},
            cits_kwargs={"verify_id_existence": True},
        )

        # Run the validator.
        # NOTE(review): the return value is currently unused; error
        # detection below relies on the non-empty summary files instead.
        validation_result = validator.validate()

        # Check if there are any validation errors: a summary file that
        # exists AND is non-empty signals errors for that section.
        has_meta_errors = (
            os.path.exists(f"{validation_output_dir}/meta_validation_summary.txt")
            and os.path.getsize(f"{validation_output_dir}/meta_validation_summary.txt")
            > 0
        )
        has_cits_errors = (
            os.path.exists(f"{validation_output_dir}/cits_validation_summary.txt")
            and os.path.getsize(f"{validation_output_dir}/cits_validation_summary.txt")
            > 0
        )

        if has_meta_errors or has_cits_errors:
            # Generate HTML report for validation errors
            report_filename = f"validation_issue_{issue_number}.html"
            report_path = f"{validation_reports_dir}/{report_filename}"

            # Generate metadata report if there were metadata errors
            if has_meta_errors:
                make_gui(
                    "temp_metadata.csv",
                    f"{validation_output_dir}/out_validate_meta.json",
                    f"{validation_output_dir}/meta_report.html",
                )

            # Generate citations report if there were citation errors
            if has_cits_errors:
                make_gui(
                    "temp_citations.csv",
                    f"{validation_output_dir}/out_validate_cits.json",
                    f"{validation_output_dir}/cits_report.html",
                )

            # Merge reports if both exist, otherwise copy the single report
            if has_meta_errors and has_cits_errors:
                merge_html_files(
                    f"{validation_output_dir}/meta_report.html",
                    f"{validation_output_dir}/cits_report.html",
                    report_path,
                )
            elif has_meta_errors:
                shutil.copy(f"{validation_output_dir}/meta_report.html", report_path)
            else:  # has_cits_errors
                shutil.copy(f"{validation_output_dir}/cits_report.html", report_path)

            # Get repository from environment and construct the GitHub
            # Pages URL where the report will be published.
            repository = os.environ["GITHUB_REPOSITORY"]
            base_url = f"https://{repository.split('/')[0]}.github.io/{repository.split('/')[1]}"
            report_url = f"{base_url}/validation_reports/{report_filename}"

            # Add report to archive manager
            archive_manager.add_report(report_filename, report_url)

            # Create error message based on which parts have errors
            error_parts = []
            if has_meta_errors:
                error_parts.append("metadata")
            if has_cits_errors:
                error_parts.append("citations")

            # Use index.html with report parameter for redirection
            error_message = f"Validation errors found in {' and '.join(error_parts)}. Please check the detailed validation report: {base_url}/validation_reports/index.html?report={report_filename}"
            return False, error_message

        # If no validation errors, return success
        # Check if this is a test deposit (localhost domain)
        domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
        is_test = domain_match and "localhost" in domain_match.group(1).lower()

        if is_test:
            success_message = "Test deposit validated successfully! This is recognized as a test deposit (localhost domain) and will not be uploaded to Zenodo or processed into OpenCitations Index/Meta. The data has been validated and the issue will be closed."
        else:
            success_message = "Thank you for your contribution! OpenCitations just processed the data you provided. The citations will soon be available on the [OpenCitations Index](https://opencitations.net/index) and metadata on [OpenCitations Meta](https://opencitations.net/meta)"

        return (True, success_message)

    except Exception as e:
        # Any failure (bad CSV, validator error, missing files) is reported
        # to the contributor as a generic formatting problem.
        logger.error(f"Validation error: {e}")
        return (
            False,
            "Please ensure both metadata and citations are valid CSVs following the required format. For more information about the correct format, please check our guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md",
        )
    finally:
        # Clean up temporary files in all cases
        cleanup_files = [
            "temp_metadata.csv",
            "temp_citations.csv",
        ]
        for file in cleanup_files:
            if os.path.exists(file):
                os.remove(file)

        if os.path.exists(validation_output_dir):
            shutil.rmtree(validation_output_dir)
def answer(
    is_valid: bool,
    message: str,
    issue_number: str,
    is_authorized: bool = True,
    is_test: bool = False,
) -> None:
    """Update issue status and add comment using GitHub REST API.

    Args:
        is_valid: Whether the issue content is valid
        message: Comment message to add
        issue_number: GitHub issue number to update
        is_authorized: Whether the user is authorized (in safe list)
        is_test: Whether this is a test issue (localhost domain)
    """
    print(f"Updating issue #{issue_number}")

    # Choose the label: authorization problems dominate, then validity;
    # a valid test issue is marked done since it will not be processed.
    if not is_authorized:
        label = "rejected"
    elif not is_valid:
        label = "invalid"
    else:
        label = "done" if is_test else "to be processed"

    print(f"Adding label '{label}' to issue #{issue_number}")

    # Repository slug comes from the GitHub Actions environment.
    repository = os.environ["GITHUB_REPOSITORY"]

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issues_url = f"https://api.github.com/repos/{repository}/issues"

    # Step 1: attach the label to the issue.
    try:
        label_response = requests.post(
            f"{issues_url}/{issue_number}/labels",
            headers=headers,
            json={"labels": [label]},
            timeout=30,
        )
        label_response.raise_for_status()
        print(f"Successfully added label '{label}' to issue #{issue_number}")
    except requests.RequestException as e:
        print(f"Error adding label to issue #{issue_number}: {e}")
        raise

    # Step 2: post the explanatory comment, then close the issue.
    try:
        comment_response = requests.post(
            f"{issues_url}/{issue_number}/comments",
            headers=headers,
            json={"body": message},
            timeout=30,
        )
        comment_response.raise_for_status()
        print(f"Successfully added comment to issue #{issue_number}")

        close_response = requests.patch(
            f"{issues_url}/{issue_number}",
            headers=headers,
            json={"state": "closed"},
            timeout=30,
        )
        close_response.raise_for_status()
        print(f"Successfully closed issue #{issue_number}")

    except requests.RequestException as e:
        print(f"Error updating issue #{issue_number}: {e}")
        raise
def get_user_id(username: str) -> Optional[int]:
    """Get GitHub user ID from username with retries on failure.

    Args:
        username: GitHub username to lookup

    Returns:
        The user's GitHub ID if found, None otherwise (unknown user or
        all retry attempts exhausted)
    """
    MAX_RETRIES = 3
    RETRY_DELAY = 5  # seconds

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(
                f"https://api.github.com/users/{username}",
                headers={
                    "Accept": "application/vnd.github+json",
                    "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
                },
                timeout=30,
            )
            if response.status_code == 200:
                return response.json().get("id")
            elif response.status_code == 404:
                # Unknown username: not retryable.
                return None
            # Handle rate limiting: sleep until the limit resets, then retry.
            elif (
                response.status_code == 403
                and "X-RateLimit-Remaining" in response.headers
            ):
                if int(response.headers["X-RateLimit-Remaining"]) == 0:
                    reset_time = int(response.headers["X-RateLimit-Reset"])
                    sleep_time = max(reset_time - time.time(), 0)
                    time.sleep(sleep_time)
                    continue
            # Other status codes indicate API problems; back off briefly
            # before retrying instead of hammering the endpoint (the
            # previous version looped again immediately).
            time.sleep(RETRY_DELAY)
        except requests.ReadTimeout:
            continue
        except requests.ConnectionError:
            time.sleep(RETRY_DELAY)
            continue

    return None  # All attempts failed
def get_data_to_store(
    issue_title: str,
    issue_body: str,
    created_at: str,
    had_primary_source: str,
    user_id: int,
) -> dict:
    """Get structured data from issue content for storage.

    Args:
        issue_title: Title of the GitHub issue
        issue_body: Body content of the GitHub issue
        created_at: ISO timestamp when issue was created
        had_primary_source: URL of the original issue
        user_id: GitHub user ID of issue author

    Returns:
        Dictionary containing structured issue data and provenance information

    Raises:
        ValueError: If issue body cannot be split or CSV data is invalid
    """
    try:
        # Extract domain from title (e.g., "deposit localhost:330 doi:10.1234/..." -> "localhost:330")
        domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
        domain = domain_match.group(1) if domain_match else ""

        # Split and clean the data sections (exactly one separator expected;
        # unpacking raises if the body splits into more or fewer sections)
        metadata_csv, citations_csv = [
            section.strip() for section in issue_body.split("===###===@@@===")
        ]

        metadata = list(csv.DictReader(io.StringIO(metadata_csv)))
        citations = list(csv.DictReader(io.StringIO(citations_csv)))

        # Validate required data
        if not metadata or not citations:
            raise ValueError("Empty metadata or citations section")

        return {
            "data": {
                "title": issue_title,
                "domain": domain,
                "metadata": metadata,
                "citations": citations,
            },
            "provenance": {
                "generatedAtTime": created_at,
                "wasAttributedTo": f"https://api.github.com/user/{user_id}",
                "hadPrimarySource": had_primary_source,
            },
        }
    except Exception as e:
        # Chain the original exception so the root cause stays visible
        # in tracebacks instead of being silently discarded.
        raise ValueError(f"Failed to process issue data: {str(e)}") from e
def _get_zenodo_token() -> str:
    """Get the appropriate Zenodo token based on environment.

    Thin wrapper around crowdsourcing.zenodo_utils.get_zenodo_token so the
    Zenodo helpers in this module share a single local access point.
    """
    return get_zenodo_token()
def _create_deposition_resource(
    date: str, base_url: str = "https://zenodo.org/api"
) -> Tuple[str, str]:
    """Create a new deposition resource on Zenodo.

    Args:
        date: ISO date string (YYYY-MM-DD); its YYYY-MM prefix is embedded
            in the deposition title and description
        base_url: Zenodo API base URL (production by default; the sandbox
            URL is passed in by deposit_on_zenodo during development)

    Returns:
        The (deposition_id, bucket) pair as returned by
        create_deposition_resource.
    """
    # Fixed dataset metadata for the monthly crowdsourcing deposit.
    metadata = {
        "upload_type": "dataset",
        "publication_date": date,
        "title": f"OpenCitations crowdsourcing: deposits of {date[:7]}",
        "creators": [
            {
                "name": "crocibot",
                "affiliation": "Research Centre for Open Scholarly Metadata, Department of Classical Philology and Italian Studies, University of Bologna, Bologna, Italy",
            }
        ],
        "description": f"OpenCitations collects citation data and related metadata from the community through issues on the GitHub repository <a href='https://github.com/opencitations/crowdsourcing'>https://github.com/opencitations/crowdsourcing</a>. In order to preserve long-term provenance information, such data is uploaded to Zenodo every month. This upload contains the data of deposit issues published in {date[:7]}.",
        "access_right": "open",
        "license": "CC0-1.0",
        "prereserve_doi": True,
        "keywords": [
            "OpenCitations",
            "crowdsourcing",
            "provenance",
            "GitHub issues",
        ],
        "related_identifiers": [
            {
                "identifier": "https://github.com/opencitations/crowdsourcing",
                "relation": "isDerivedFrom",
                "resource_type": "dataset",
            }
        ],
        "version": "1.0.0",
    }
    return create_deposition_resource(date, metadata, base_url)
def _upload_data(
    date: str, bucket: str, base_url: str = "https://zenodo.org/api"
) -> None:
    """Upload data file to Zenodo bucket.

    Args:
        date: ISO date string used to build the remote file name
        bucket: Zenodo bucket URL of the target deposition
        base_url: Zenodo API base URL (kept for interface symmetry with the
            other Zenodo helpers; not used directly here)

    Raises:
        requests.HTTPError: If the upload request fails
    """
    filename = f"{date}_weekly_deposit.json"

    with open("data_to_store.json", "rb") as fp:
        # PUT the local payload into the bucket under the date-based name.
        # (Previously the URL contained a stray literal instead of the
        # computed filename, which was never used.)
        response = requests.put(
            f"{bucket}/{filename}",
            data=fp,
            params={"access_token": _get_zenodo_token()},
            timeout=30,
        )
        response.raise_for_status()
def deposit_on_zenodo(data_to_store: List[dict]) -> None:
    """Deposit data on Zenodo based on environment.

    Args:
        data_to_store: List of structured issue-data dictionaries as
            produced by get_data_to_store

    Raises:
        Exception: If Zenodo does not acknowledge publication with HTTP 202
    """
    environment = os.environ.get("ENVIRONMENT", "development")

    # In development, use the Zenodo Sandbox
    if environment == "development":
        base_url = "https://sandbox.zenodo.org/api"
    else:
        base_url = "https://zenodo.org/api"

    try:
        # Save the data to a temporary file (read back by _upload_data)
        with open("data_to_store.json", "w") as f:
            json.dump(data_to_store, f)

        # Create a new deposition
        deposition_id, bucket = _create_deposition_resource(
            datetime.now().strftime("%Y-%m-%d"), base_url=base_url
        )

        # Upload the data
        _upload_data(datetime.now().strftime("%Y-%m-%d"), bucket, base_url=base_url)

        # Publish the deposition
        response = requests.post(
            f"{base_url}/deposit/depositions/{deposition_id}/actions/publish",
            params={"access_token": _get_zenodo_token()},
            timeout=30,
        )

        # Zenodo answers 202 Accepted on successful publication
        if response.status_code != 202:
            raise Exception(f"Failed to publish deposition: {response.text}")

    finally:
        # Clean up the temporary file in all cases
        if os.path.exists("data_to_store.json"):
            os.remove("data_to_store.json")
def is_in_safe_list(user_id: int) -> bool:
    """Check if a user ID is in the safe list.

    Args:
        user_id: GitHub user ID to check

    Returns:
        bool: True if user is in safe list, False otherwise
    """
    try:
        with open(SAFE_LIST_PATH, "r") as f:
            # yaml.safe_load returns None for an empty file; treat that as
            # an empty safe list instead of crashing on .get below.
            safe_list = yaml.safe_load(f) or {}
        # Extract just the IDs for comparison
        allowed_ids = {str(user["id"]) for user in safe_list.get("users", [])}
        return str(user_id) in allowed_ids
    except FileNotFoundError:
        print("Warning: safe_list.yaml not found, creating empty file")
        # Create empty safe list file with proper structure
        with open(SAFE_LIST_PATH, "w") as f:
            yaml.dump({"users": []}, f)
        return False
    except yaml.YAMLError as e:
        print(f"Error parsing safe_list.yaml: {e}")
        return False
def get_open_issues() -> List[dict]:
    """Fetch open issues with title starting with 'deposit ' that have no labels.

    Issues without labels have not been processed yet by the bot.
    Once processed, the bot adds a label (rejected, invalid, to be processed).

    Returns:
        List of dicts with keys: title, body, number (as str), author
        (dict with login), createdAt, url. Empty list on 404 or if all
        retries return without a usable response.

    Raises:
        RuntimeError: If request errors persist through all retry attempts.
        KeyError: If GITHUB_REPOSITORY or GH_TOKEN is not set.
    """
    print("Attempting to fetch open issues...")

    # Get repository info from GitHub Actions environment
    repository = os.environ["GITHUB_REPOSITORY"]
    print(f"Checking repository: {repository}")

    MAX_RETRIES = 3
    RETRY_DELAY = 5  # seconds between retries after request errors

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
    }

    for attempt in range(MAX_RETRIES):
        try:
            print(f"Attempt {attempt + 1} of {MAX_RETRIES}")
            # NOTE(review): only the first page of results is requested —
            # confirm whether pagination matters for busy repositories.
            response = requests.get(
                f"https://api.github.com/repos/{repository}/issues",
                params={"state": "open"},
                headers=headers,
                timeout=30,
            )

            print(f"Response status code: {response.status_code}")

            if response.status_code == 200:
                issues = response.json()
                print(f"Found {len(issues)} total open issues")

                # Filter issues by title and absence of labels
                deposit_issues = []
                for issue in issues:
                    # Skip if title doesn't start with 'deposit '
                    if not issue["title"].startswith("deposit "):
                        continue

                    # Skip if issue already has labels (already processed by bot)
                    if issue.get("labels"):
                        print(f"Skipping issue #{issue['number']} - already has labels")
                        continue

                    deposit_issues.append({
                        "title": issue["title"],
                        "body": issue["body"],
                        "number": str(issue["number"]),
                        "author": {"login": issue["user"]["login"]},
                        "createdAt": issue["created_at"],
                        "url": issue["html_url"],
                    })

                print(f"Found {len(deposit_issues)} deposit issues to process")
                return deposit_issues

            elif response.status_code == 404:
                print("Repository or endpoint not found (404)")
                return []

            # Rate-limited: wait out the reset window, then retry.
            elif (
                response.status_code == 403
                and "X-RateLimit-Remaining" in response.headers
            ):
                print(
                    f"Rate limit info: {response.headers.get('X-RateLimit-Remaining')} requests remaining"
                )
                if int(response.headers["X-RateLimit-Remaining"]) == 0:
                    reset_time = int(response.headers["X-RateLimit-Reset"])
                    current_time = time.time()
                    if reset_time > current_time:
                        sleep_time = reset_time - current_time
                        print(f"Rate limit exceeded. Waiting {sleep_time} seconds")
                        time.sleep(sleep_time)
                        continue
                    continue
            else:
                print(f"Unexpected status code: {response.status_code}")
                print(f"Response body: {response.text}")

        except (requests.RequestException, KeyError) as e:
            print(f"Error during request: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                print(f"Waiting {RETRY_DELAY} seconds before retry")
                time.sleep(RETRY_DELAY)
                continue
            # Out of attempts: surface the last error to the caller.
            raise RuntimeError(
                f"Failed to fetch issues after {MAX_RETRIES} attempts"
            ) from e

    return []
def process_open_issues() -> None:
    """Process all open issues.

    For each unprocessed deposit issue: verify the author is in the safe
    list, validate title and body, answer/label/close the issue, and
    collect valid data. Valid non-test data is then deposited on Zenodo;
    issues with a localhost domain are treated as tests and skipped.
    """
    try:
        print("Starting to process open issues")
        issues = get_open_issues()
        print(f"Found {len(issues)} open issues to process")

        data_to_store = list()

        for issue in issues:
            issue_number = issue["number"]
            print(f"Processing issue #{issue_number}")

            username = issue["author"]["login"]
            print(f"Getting user ID for {username}")
            user_id = get_user_id(username)
            print(f"User ID for {username}: {user_id}")

            # Unauthorized users get a rejection answer and are skipped.
            if not is_in_safe_list(user_id):
                print(f"WARNING: User {username} (ID: {user_id}) not in safe list")
                answer(
                    False,
                    "To make a deposit, please contact OpenCitations at <contact@opencitations.net> to register as a trusted user",
                    issue_number,
                    is_authorized=False,
                )
                continue

            print(f"User {username} is authorized")
            issue_title = issue["title"]
            issue_body = issue["body"]
            created_at = issue["createdAt"]
            had_primary_source = issue["url"]

            print(f"Validating issue #{issue_number}")
            is_valid, message = validate(issue_title, issue_body, issue_number)
            print(
                f"Validation result for #{issue_number}: valid={is_valid}, message={message}"
            )

            # Check if this is a test issue (localhost domain)
            domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
            is_test = domain_match and "localhost" in domain_match.group(1).lower()

            answer(is_valid, message, issue_number, is_authorized=True, is_test=is_test)
            print(f"Posted answer to issue #{issue_number}")

            if is_valid:
                print(f"Getting data to store for issue #{issue_number}")
                try:
                    issue_data = get_data_to_store(
                        issue_title, issue_body, created_at, had_primary_source, user_id
                    )
                    data_to_store.append(issue_data)
                    print(f"Successfully processed data for issue #{issue_number}")
                except Exception as e:
                    # A single malformed issue must not stop the batch.
                    print(f"ERROR: Processing data for issue #{issue_number}: {e}")
                    continue

        if data_to_store:
            # Filter out test issues (those with "localhost" in domain)
            production_data = [
                item for item in data_to_store
                if "localhost" not in item.get("data", {}).get("domain", "").lower()
            ]
            test_data = [
                item for item in data_to_store
                if "localhost" in item.get("data", {}).get("domain", "").lower()
            ]

            if test_data:
                print(f"Skipping Zenodo deposit for {len(test_data)} test issue(s) with localhost domain")

            if production_data:
                print(f"Attempting to deposit {len(production_data)} production items to Zenodo")
                try:
                    deposit_on_zenodo(production_data)
                    print("Successfully deposited data to Zenodo")
                except Exception as e:
                    print(f"ERROR: Failed to deposit data to Zenodo: {e}")
                    raise
            else:
                print("No production data to deposit (all issues are test issues with localhost domain)")

    except Exception as e:
        print(f"ERROR: Processing issues: {e}")
        raise
    finally:
        print("Completed processing open issues")
# Script entry point: process all unlabeled open deposit issues.
if __name__ == "__main__":  # pragma: no cover
    process_open_issues()