Coverage for crowdsourcing / process_issues.py: 98%

323 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-21 14:31 +0000

1# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5 

6import csv 

7import io 

8import json 

9import logging 

10import os 

11import re 

12import shutil 

13import time 

14from datetime import datetime 

15from typing import List, Optional, Tuple 

16 

17import requests 

18import yaml 

19from oc_ds_converter.oc_idmanager.base import IdentifierManager 

20from oc_ds_converter.oc_idmanager.doi import DOIManager 

21from oc_ds_converter.oc_idmanager.isbn import ISBNManager 

22from oc_ds_converter.oc_idmanager.openalex import OpenAlexManager 

23from oc_ds_converter.oc_idmanager.pmcid import PMCIDManager 

24from oc_ds_converter.oc_idmanager.pmid import PMIDManager 

25from oc_ds_converter.oc_idmanager.url import URLManager 

26from oc_ds_converter.oc_idmanager.wikidata import WikidataManager 

27from oc_ds_converter.oc_idmanager.wikipedia import WikipediaManager 

28from oc_validator.interface.gui import make_gui, merge_html_files 

29from oc_validator.main import ClosureValidator 

30from crowdsourcing.archive_manager import ArchiveManager 

31from crowdsourcing.zenodo_utils import create_deposition_resource, get_zenodo_token 

32 

# Constants
# Path to the YAML file listing the GitHub users trusted to make deposits
# (consumed by is_in_safe_list below).
SAFE_LIST_PATH = "safe_list.yaml"

# Initialize archive manager
# Module-level singleton used by validate() to register published
# validation reports.
archive_manager = ArchiveManager()

38 

39 

40def _validate_title(title: str) -> Tuple[bool, str]: 

41 """Validate the format and identifier in an issue title.""" 

42 basic_format = re.search( 

43 r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", 

44 title, 

45 re.IGNORECASE, 

46 ) 

47 if not basic_format: 

48 return ( 

49 False, 

50 'The title of the issue was not structured correctly. Please, follow this format: deposit {domain name of journal} {doi or other supported identifier}. For example "deposit localhost:330 doi:10.1007/978-3-030-00668-6_8". The following identifiers are currently supported: doi, isbn, pmid, pmcid, url, wikidata, wikipedia, openalex, temp, and local', 

51 ) 

52 

53 match = re.search( 

54 r"deposit\s+(.+?)\s+([a-zA-Z]+):(.+)", 

55 title, 

56 re.IGNORECASE, 

57 ) 

58 

59 identifier_schema = match.group(2).lower() 

60 identifier = match.group(3) 

61 

62 # Map of identifier types to their manager classes 

63 manager_map = { 

64 "doi": DOIManager, 

65 "isbn": ISBNManager, 

66 "pmid": PMIDManager, 

67 "pmcid": PMCIDManager, 

68 "url": URLManager, 

69 "wikidata": WikidataManager, 

70 "wikipedia": WikipediaManager, 

71 "openalex": OpenAlexManager, 

72 } 

73 

74 # Special handling for temp and local identifiers 

75 if identifier_schema in ["temp", "local"]: 

76 # For temp and local identifiers, we just check they have some content after the colon 

77 if not identifier.strip(): 

78 return False, f"The {identifier_schema} identifier cannot be empty" 

79 return True, "" 

80 

81 manager_class = manager_map.get(identifier_schema) 

82 if not manager_class: 

83 return False, f"The identifier schema '{identifier_schema}' is not supported" 

84 

85 # Use API service for all identifiers that require online validation 

86 needs_api = {"doi", "pmid", "pmcid", "url", "wikidata", "wikipedia", "openalex"} 

87 id_manager: IdentifierManager = ( 

88 manager_class(use_api_service=True) 

89 if identifier_schema in needs_api 

90 else manager_class() 

91 ) 

92 is_valid = id_manager.is_valid(identifier) 

93 

94 if not is_valid: 

95 return ( 

96 False, 

97 f"The identifier with literal value {identifier} specified in the issue title is not a valid {identifier_schema.upper()}", 

98 ) 

99 return True, "" 

100 

101 

def validate(
    issue_title: str,
    issue_body: str,
    issue_number: str,
    validation_output_dir: str = "validation_output",
    validation_reports_dir: str = "docs/validation_reports",
) -> Tuple[bool, str]:
    """Validate issue title and body content using oc_validator.

    Args:
        issue_title: Title of the GitHub issue
        issue_body: Body content of the GitHub issue
        issue_number: GitHub issue number to update
        validation_output_dir: Directory for temporary validation output files
        validation_reports_dir: Directory where validation reports will be stored

    Returns:
        Tuple containing:
        - bool: Whether the content is valid
        - str: Validation message or error details
    """
    logger = logging.getLogger(__name__)

    logger.info("Starting validation")
    logger.info(f"Validating title: {issue_title}")

    # First validate the title format
    is_valid_title, title_message = _validate_title(issue_title)
    if not is_valid_title:
        logger.warning(f"Invalid title format: {title_message}")
        return False, title_message

    # Check if body is empty
    if not issue_body:
        logger.warning("Empty issue body")
        return (
            False,
            "The issue body cannot be empty. Please provide metadata and citations in CSV format separated by '===###===@@@===', as shown in the guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md",
        )

    # Check for required separator
    if "===###===@@@===" not in issue_body:
        logger.warning("Missing required separator in issue body")
        return (
            False,
            'Please use the separator "===###===@@@===" to divide metadata from citations, as shown in the following guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md',
        )

    try:
        logger.info("Creating validation output directory")
        os.makedirs(validation_output_dir, exist_ok=True)
        os.makedirs(validation_reports_dir, exist_ok=True)

        # Split the data into metadata and citations
        split_data = issue_body.split("===###===@@@===")
        metadata_csv = split_data[0].strip()
        citations_csv = split_data[1].strip()

        # Create temporary files for validation (removed in the finally block)
        with open("temp_metadata.csv", "w", encoding="utf-8") as f:
            f.write(metadata_csv)
        with open("temp_citations.csv", "w", encoding="utf-8") as f:
            f.write(citations_csv)

        # Initialize and run validator
        validator = ClosureValidator(
            meta_csv_doc="temp_metadata.csv",
            meta_output_dir=validation_output_dir,
            cits_csv_doc="temp_citations.csv",
            cits_output_dir=validation_output_dir,
            strict_sequenciality=True,
            meta_kwargs={"verify_id_existence": True},
            cits_kwargs={"verify_id_existence": True},
        )

        # Run the validator for its side effects: it writes the summary and
        # JSON report files inspected below. (The return value was
        # previously assigned to an unused variable.)
        validator.validate()

        # A non-empty summary file means the corresponding section failed.
        has_meta_errors = (
            os.path.exists(f"{validation_output_dir}/meta_validation_summary.txt")
            and os.path.getsize(f"{validation_output_dir}/meta_validation_summary.txt")
            > 0
        )
        has_cits_errors = (
            os.path.exists(f"{validation_output_dir}/cits_validation_summary.txt")
            and os.path.getsize(f"{validation_output_dir}/cits_validation_summary.txt")
            > 0
        )

        if has_meta_errors or has_cits_errors:
            # Generate HTML report for validation errors
            report_filename = f"validation_issue_{issue_number}.html"
            report_path = f"{validation_reports_dir}/{report_filename}"

            # Generate metadata report if there were metadata errors
            if has_meta_errors:
                make_gui(
                    "temp_metadata.csv",
                    f"{validation_output_dir}/out_validate_meta.json",
                    f"{validation_output_dir}/meta_report.html",
                )

            # Generate citations report if there were citation errors
            if has_cits_errors:
                make_gui(
                    "temp_citations.csv",
                    f"{validation_output_dir}/out_validate_cits.json",
                    f"{validation_output_dir}/cits_report.html",
                )

            # Merge reports if both exist, otherwise copy the single report
            if has_meta_errors and has_cits_errors:
                merge_html_files(
                    f"{validation_output_dir}/meta_report.html",
                    f"{validation_output_dir}/cits_report.html",
                    report_path,
                )
            elif has_meta_errors:
                shutil.copy(f"{validation_output_dir}/meta_report.html", report_path)
            else:  # has_cits_errors
                shutil.copy(f"{validation_output_dir}/cits_report.html", report_path)

            # Get repository from environment and construct the GitHub Pages
            # URL where the report will be published
            repository = os.environ["GITHUB_REPOSITORY"]
            base_url = f"https://{repository.split('/')[0]}.github.io/{repository.split('/')[1]}"
            report_url = f"{base_url}/validation_reports/{report_filename}"

            # Add report to archive manager
            archive_manager.add_report(report_filename, report_url)

            # Create error message based on which parts have errors
            error_parts = []
            if has_meta_errors:
                error_parts.append("metadata")
            if has_cits_errors:
                error_parts.append("citations")

            # Use index.html with report parameter for redirection
            error_message = f"Validation errors found in {' and '.join(error_parts)}. Please check the detailed validation report: {base_url}/validation_reports/index.html?report={report_filename}"
            return False, error_message

        # If no validation errors, return success
        # Check if this is a test deposit (localhost domain)
        domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
        is_test = domain_match and "localhost" in domain_match.group(1).lower()

        if is_test:
            success_message = "Test deposit validated successfully! This is recognized as a test deposit (localhost domain) and will not be uploaded to Zenodo or processed into OpenCitations Index/Meta. The data has been validated and the issue will be closed."
        else:
            success_message = "Thank you for your contribution! OpenCitations just processed the data you provided. The citations will soon be available on the [OpenCitations Index](https://opencitations.net/index) and metadata on [OpenCitations Meta](https://opencitations.net/meta)"

        return (True, success_message)

    except Exception as e:
        logger.error(f"Validation error: {e}")
        return (
            False,
            "Please ensure both metadata and citations are valid CSVs following the required format. For more information about the correct format, please check our guide: https://github.com/opencitations/crowdsourcing/blob/main/README.md",
        )
    finally:
        # Clean up temporary files in all cases
        cleanup_files = [
            "temp_metadata.csv",
            "temp_citations.csv",
        ]
        for file in cleanup_files:
            if os.path.exists(file):
                os.remove(file)

        if os.path.exists(validation_output_dir):
            shutil.rmtree(validation_output_dir)

274 

275 

def answer(
    is_valid: bool, message: str, issue_number: str, is_authorized: bool = True, is_test: bool = False
) -> None:
    """Label, comment on, and close a GitHub issue via the REST API.

    Args:
        is_valid: Whether the issue content is valid
        message: Comment message to add
        issue_number: GitHub issue number to update
        is_authorized: Whether the user is authorized (in safe list)
        is_test: Whether this is a test issue (localhost domain)
    """
    print(f"Updating issue #{issue_number}")

    # Pick the label reflecting authorization, validity and test status.
    if not is_authorized:
        label = "rejected"
    elif not is_valid:
        label = "invalid"
    elif is_test:
        # Test issues are marked as done since they won't be processed
        label = "done"
    else:
        label = "to be processed"

    print(f"Adding label '{label}' to issue #{issue_number}")

    # Get repository from environment
    repo = os.environ["GITHUB_REPOSITORY"]

    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issues_api = f"https://api.github.com/repos/{repo}/issues"

    # Step 1: attach the label.
    try:
        label_resp = requests.post(
            f"{issues_api}/{issue_number}/labels",
            headers=request_headers,
            json={"labels": [label]},
            timeout=30,
        )
        label_resp.raise_for_status()
        print(f"Successfully added label '{label}' to issue #{issue_number}")
    except requests.RequestException as e:
        print(f"Error adding label to issue #{issue_number}: {e}")
        raise

    # Step 2: leave the explanatory comment, then close the issue.
    try:
        comment_resp = requests.post(
            f"{issues_api}/{issue_number}/comments",
            headers=request_headers,
            json={"body": message},
            timeout=30,
        )
        comment_resp.raise_for_status()
        print(f"Successfully added comment to issue #{issue_number}")

        close_resp = requests.patch(
            f"{issues_api}/{issue_number}",
            headers=request_headers,
            json={"state": "closed"},
            timeout=30,
        )
        close_resp.raise_for_status()
        print(f"Successfully closed issue #{issue_number}")

    except requests.RequestException as e:
        print(f"Error updating issue #{issue_number}: {e}")
        raise

351 

352 

def get_user_id(username: str) -> Optional[int]:
    """Get GitHub user ID from username with retries on failure.

    Args:
        username: GitHub username to lookup

    Returns:
        The user's GitHub ID if found, None otherwise
    """
    MAX_RETRIES = 3
    RETRY_DELAY = 5  # seconds

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(
                f"https://api.github.com/users/{username}",
                headers={
                    "Accept": "application/vnd.github+json",
                    "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
                },
                timeout=30,
            )
            if response.status_code == 200:
                return response.json().get("id")
            elif response.status_code == 404:
                # Unknown username: no point in retrying.
                return None
            # Handle rate limiting: wait until the limit resets, then retry.
            elif (
                response.status_code == 403
                and "X-RateLimit-Remaining" in response.headers
            ):
                if int(response.headers["X-RateLimit-Remaining"]) == 0:
                    reset_time = int(response.headers["X-RateLimit-Reset"])
                    sleep_time = max(reset_time - time.time(), 0)
                    time.sleep(sleep_time)
                    continue
            # Any other status code signals a transient API problem: retry.

        except (requests.ReadTimeout, requests.ConnectionError):
            # Back off before retrying. Previously ReadTimeout retried
            # immediately with no delay, unlike ConnectionError, hammering
            # an API that just timed out.
            time.sleep(RETRY_DELAY)
            continue

    return None  # All attempts failed

398 

399 

def get_data_to_store(
    issue_title: str,
    issue_body: str,
    created_at: str,
    had_primary_source: str,
    user_id: int,
) -> dict:
    """Get structured data from issue content for storage.

    Args:
        issue_title: Title of the GitHub issue
        issue_body: Body content of the GitHub issue
        created_at: ISO timestamp when issue was created
        had_primary_source: URL of the original issue
        user_id: GitHub user ID of issue author

    Returns:
        Dictionary containing structured issue data and provenance information

    Raises:
        ValueError: If issue body cannot be split or CSV data is invalid
    """
    try:
        # Extract domain from title (e.g., "deposit localhost:330 doi:10.1234/..." -> "localhost:330")
        domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
        domain = domain_match.group(1) if domain_match else ""

        # Split on the FIRST separator only: a stray separator inside the
        # citations section previously broke the two-value unpacking.
        metadata_csv, citations_csv = [
            section.strip()
            for section in issue_body.split("===###===@@@===", maxsplit=1)
        ]

        metadata = list(csv.DictReader(io.StringIO(metadata_csv)))
        citations = list(csv.DictReader(io.StringIO(citations_csv)))

        # Validate required data
        if not metadata or not citations:
            raise ValueError("Empty metadata or citations section")

        return {
            "data": {
                "title": issue_title,
                "domain": domain,
                "metadata": metadata,
                "citations": citations,
            },
            "provenance": {
                "generatedAtTime": created_at,
                "wasAttributedTo": f"https://api.github.com/user/{user_id}",
                "hadPrimarySource": had_primary_source,
            },
        }
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(f"Failed to process issue data: {str(e)}") from e

454 

455 

def _get_zenodo_token() -> str:
    """Get the appropriate Zenodo token based on environment.

    Thin wrapper around crowdsourcing.zenodo_utils.get_zenodo_token, kept so
    the rest of this module depends only on a private local name.
    """
    return get_zenodo_token()

459 

460 

def _create_deposition_resource(
    date: str, base_url: str = "https://zenodo.org/api"
) -> Tuple[str, str]:
    """Create a new deposition resource on Zenodo.

    Builds the deposition metadata for the monthly crowdsourcing upload and
    delegates creation to crowdsourcing.zenodo_utils.create_deposition_resource.
    """
    month = date[:7]

    creator = {
        "name": "crocibot",
        "affiliation": "Research Centre for Open Scholarly Metadata, Department of Classical Philology and Italian Studies, University of Bologna, Bologna, Italy",
    }
    source_repo = {
        "identifier": "https://github.com/opencitations/crowdsourcing",
        "relation": "isDerivedFrom",
        "resource_type": "dataset",
    }

    metadata = {
        "upload_type": "dataset",
        "publication_date": date,
        "title": f"OpenCitations crowdsourcing: deposits of {month}",
        "creators": [creator],
        "description": f"OpenCitations collects citation data and related metadata from the community through issues on the GitHub repository <a href='https://github.com/opencitations/crowdsourcing'>https://github.com/opencitations/crowdsourcing</a>. In order to preserve long-term provenance information, such data is uploaded to Zenodo every month. This upload contains the data of deposit issues published in {month}.",
        "access_right": "open",
        "license": "CC0-1.0",
        "prereserve_doi": True,
        "keywords": [
            "OpenCitations",
            "crowdsourcing",
            "provenance",
            "GitHub issues",
        ],
        "related_identifiers": [source_repo],
        "version": "1.0.0",
    }

    return create_deposition_resource(date, metadata, base_url)

495 

496 

def _upload_data(
    date: str, bucket: str, base_url: str = "https://zenodo.org/api"
) -> None:
    """Upload the staged data file to a Zenodo deposition bucket.

    Args:
        date: ISO date (YYYY-MM-DD) used to build the remote file name.
        bucket: Bucket URL of the target Zenodo deposition.
        base_url: Zenodo API root. NOTE(review): currently unused here;
            kept for interface symmetry with the other Zenodo helpers.

    Raises:
        requests.HTTPError: If Zenodo rejects the upload.
    """
    filename = f"{date}_weekly_deposit.json"

    with open("data_to_store.json", "rb") as fp:
        # BUG FIX: the upload URL previously contained a literal placeholder
        # instead of interpolating `filename`, which was computed but unused.
        response = requests.put(
            f"{bucket}/{filename}",
            data=fp,
            params={"access_token": _get_zenodo_token()},
            timeout=30,
        )
        response.raise_for_status()

511 

512 

def deposit_on_zenodo(data_to_store: List[dict]) -> None:
    """Deposit data on Zenodo based on environment.

    Stages the payload in a temporary JSON file, creates a deposition,
    uploads the file and publishes the deposition. The temporary file is
    always removed afterwards.
    """
    environment = os.environ.get("ENVIRONMENT", "development")

    # Development deposits go to the Zenodo Sandbox.
    base_url = (
        "https://sandbox.zenodo.org/api"
        if environment == "development"
        else "https://zenodo.org/api"
    )

    try:
        # Stage the payload in a temporary file.
        with open("data_to_store.json", "w") as f:
            json.dump(data_to_store, f)

        # Create a fresh deposition resource.
        deposition_id, bucket = _create_deposition_resource(
            datetime.now().strftime("%Y-%m-%d"), base_url=base_url
        )

        # Upload the staged data into the deposition bucket.
        _upload_data(datetime.now().strftime("%Y-%m-%d"), bucket, base_url=base_url)

        # Publish the deposition (Zenodo answers 202 on success).
        response = requests.post(
            f"{base_url}/deposit/depositions/{deposition_id}/actions/publish",
            params={"access_token": _get_zenodo_token()},
            timeout=30,
        )

        if response.status_code != 202:
            raise Exception(f"Failed to publish deposition: {response.text}")

    finally:
        # Clean up the temporary file in every case.
        if os.path.exists("data_to_store.json"):
            os.remove("data_to_store.json")

550 

551 

def is_in_safe_list(user_id: int) -> bool:
    """Check if a user ID is in the safe list.

    Args:
        user_id: GitHub user ID to check

    Returns:
        bool: True if user is in safe list, False otherwise
    """
    try:
        with open(SAFE_LIST_PATH, "r") as f:
            safe_list = yaml.safe_load(f)
        # yaml.safe_load returns None for an empty file (and may return a
        # non-dict for malformed content); treat both as an empty safe list
        # instead of crashing on .get below.
        if not isinstance(safe_list, dict):
            return False
        # `or []` guards against a present-but-null `users:` key.
        # Compare as strings so int and str IDs in the YAML both match.
        allowed_ids = {str(user["id"]) for user in safe_list.get("users") or []}
        return str(user_id) in allowed_ids
    except FileNotFoundError:
        print("Warning: safe_list.yaml not found, creating empty file")
        # Create empty safe list file with proper structure
        with open(SAFE_LIST_PATH, "w") as f:
            yaml.dump({"users": []}, f)
        return False
    except yaml.YAMLError as e:
        print(f"Error parsing safe_list.yaml: {e}")
        return False

576 

577 

def get_open_issues() -> List[dict]:
    """Fetch open issues with title starting with 'deposit ' that have no labels.

    Issues without labels have not been processed yet by the bot.
    Once processed, the bot adds a label (rejected, invalid, to be processed).

    Returns:
        List of dicts with the fields the pipeline needs (title, body,
        number, author, createdAt, url); empty list when nothing matches or
        the repository is not found.

    Raises:
        RuntimeError: If every retry attempt fails with a request error.
    """
    print("Attempting to fetch open issues...")

    # Get repository info from GitHub Actions environment
    repository = os.environ["GITHUB_REPOSITORY"]
    print(f"Checking repository: {repository}")

    MAX_RETRIES = 3
    RETRY_DELAY = 5

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GH_TOKEN']}",
    }

    for attempt in range(MAX_RETRIES):
        try:
            print(f"Attempt {attempt + 1} of {MAX_RETRIES}")
            # per_page=100 (the API maximum): the default page size of 30
            # would silently truncate a larger backlog of open issues.
            response = requests.get(
                f"https://api.github.com/repos/{repository}/issues",
                params={"state": "open", "per_page": 100},
                headers=headers,
                timeout=30,
            )

            print(f"Response status code: {response.status_code}")

            if response.status_code == 200:
                issues = response.json()
                print(f"Found {len(issues)} total open issues")

                # Filter issues by title and absence of labels
                deposit_issues = []
                for issue in issues:
                    # Skip if title doesn't start with 'deposit '
                    if not issue["title"].startswith("deposit "):
                        continue

                    # Skip if issue already has labels (already processed by bot)
                    if issue.get("labels"):
                        print(f"Skipping issue #{issue['number']} - already has labels")
                        continue

                    deposit_issues.append({
                        "title": issue["title"],
                        "body": issue["body"],
                        "number": str(issue["number"]),
                        "author": {"login": issue["user"]["login"]},
                        "createdAt": issue["created_at"],
                        "url": issue["html_url"],
                    })

                print(f"Found {len(deposit_issues)} deposit issues to process")
                return deposit_issues

            elif response.status_code == 404:
                print("Repository or endpoint not found (404)")
                return []

            elif (
                response.status_code == 403
                and "X-RateLimit-Remaining" in response.headers
            ):
                print(
                    f"Rate limit info: {response.headers.get('X-RateLimit-Remaining')} requests remaining"
                )
                # Wait until the advertised reset time before retrying.
                if int(response.headers["X-RateLimit-Remaining"]) == 0:
                    reset_time = int(response.headers["X-RateLimit-Reset"])
                    current_time = time.time()
                    if reset_time > current_time:
                        sleep_time = reset_time - current_time
                        print(f"Rate limit exceeded. Waiting {sleep_time} seconds")
                        time.sleep(sleep_time)
                    continue
                continue
            else:
                print(f"Unexpected status code: {response.status_code}")
                print(f"Response body: {response.text}")

        except (requests.RequestException, KeyError) as e:
            print(f"Error during request: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                print(f"Waiting {RETRY_DELAY} seconds before retry")
                time.sleep(RETRY_DELAY)
                continue
            raise RuntimeError(
                f"Failed to fetch issues after {MAX_RETRIES} attempts"
            ) from e

    return []

673 

674 

def process_open_issues() -> None:
    """Process all open deposit issues end to end.

    For each unlabelled open issue: resolve the author's GitHub user ID,
    reject authors missing from the safe list, validate the issue content,
    post the outcome back to the issue, and finally deposit the collected
    data of valid non-test issues on Zenodo.
    """
    try:
        print("Starting to process open issues")
        issues = get_open_issues()
        print(f"Found {len(issues)} open issues to process")

        data_to_store = list()

        for issue in issues:
            issue_number = issue["number"]
            print(f"Processing issue #{issue_number}")

            username = issue["author"]["login"]
            print(f"Getting user ID for {username}")
            user_id = get_user_id(username)
            print(f"User ID for {username}: {user_id}")

            # user_id may be None when the lookup failed; such users can
            # never appear in the safe list, so they are rejected here.
            if not is_in_safe_list(user_id):
                print(f"WARNING: User {username} (ID: {user_id}) not in safe list")
                answer(
                    False,
                    "To make a deposit, please contact OpenCitations at <contact@opencitations.net> to register as a trusted user",
                    issue_number,
                    is_authorized=False,
                )
                continue

            print(f"User {username} is authorized")
            issue_title = issue["title"]
            issue_body = issue["body"]
            created_at = issue["createdAt"]
            had_primary_source = issue["url"]

            print(f"Validating issue #{issue_number}")
            is_valid, message = validate(issue_title, issue_body, issue_number)
            print(
                f"Validation result for #{issue_number}: valid={is_valid}, message={message}"
            )

            # Check if this is a test issue (localhost domain). bool() keeps
            # the flag a genuine boolean even when the regex does not match
            # (the short-circuit previously produced None in that case).
            domain_match = re.search(r"deposit\s+(.+?)\s+[a-zA-Z]+:.+", issue_title, re.IGNORECASE)
            is_test = bool(domain_match and "localhost" in domain_match.group(1).lower())

            answer(is_valid, message, issue_number, is_authorized=True, is_test=is_test)
            print(f"Posted answer to issue #{issue_number}")

            if is_valid:
                print(f"Getting data to store for issue #{issue_number}")
                try:
                    issue_data = get_data_to_store(
                        issue_title, issue_body, created_at, had_primary_source, user_id
                    )
                    data_to_store.append(issue_data)
                    print(f"Successfully processed data for issue #{issue_number}")
                except Exception as e:
                    print(f"ERROR: Processing data for issue #{issue_number}: {e}")
                    continue

        if data_to_store:
            # Filter out test issues (those with "localhost" in domain)
            production_data = [
                item for item in data_to_store
                if "localhost" not in item.get("data", {}).get("domain", "").lower()
            ]
            test_data = [
                item for item in data_to_store
                if "localhost" in item.get("data", {}).get("domain", "").lower()
            ]

            if test_data:
                print(f"Skipping Zenodo deposit for {len(test_data)} test issue(s) with localhost domain")

            if production_data:
                print(f"Attempting to deposit {len(production_data)} production items to Zenodo")
                try:
                    deposit_on_zenodo(production_data)
                    print("Successfully deposited data to Zenodo")
                except Exception as e:
                    print(f"ERROR: Failed to deposit data to Zenodo: {e}")
                    raise
            else:
                print("No production data to deposit (all issues are test issues with localhost domain)")

    except Exception as e:
        print(f"ERROR: Processing issues: {e}")
        raise
    finally:
        print("Completed processing open issues")

765 

766 

# Script entry point: scan and process every open deposit issue.
if __name__ == "__main__": # pragma: no cover
    process_open_issues()