Coverage for oc_ds_converter / pubmed / finder_nih.py: 37%
65 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023-2024 Arcangelo Massari <arcangelo.massari@unibo.it>
2#
3# SPDX-License-Identifier: ISC
5import re
6from datetime import datetime
7from urllib.parse import quote, unquote
9from bs4 import BeautifulSoup
10from oc_ds_converter.oc_idmanager.issn import ISSNManager
11from oc_ds_converter.oc_idmanager.pmid import PMIDManager
12from requests import get
class NIHResourceFinder:
    """Find ISSN, journal title and publication date for a PMID via pubmed.ncbi.nlm.nih.gov.

    The record is requested with ``?format=pubmed`` and the text of the
    ``article-details`` element is then scanned with regular expressions for
    the ``IS``, ``JT`` and ``DP`` tagged lines.
    """

    # Patterns are class-level constants: they never vary per instance, and
    # ``self._issn_regex`` etc. still resolve exactly as before.
    # ISSN value of a line starting with "IS  - ".
    _issn_regex = r"(?<=^IS\s{2}-\s)[0-9]{4}-[0-9]{3}[0-9X]"
    # Rest of a line starting with "JT  - ".
    _jtitle_regex = r"(?<=^JT\s{2}-\s)(.*)(?<!\n)$"
    # "DP - <year>[ <month abbreviation>][ <day>]"; group 1 is the date part.
    _date_regex = (
        r"DP\s+-\s+(\d{4}"
        r"(\s?(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))?"
        r"(\s?((3[0-1])|([1-2][0-9])|([0]?[1-9])))?)"
    )
    # (search pattern, strptime format, output format), most precise first.
    _date_layouts = (
        (
            r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
            r"\s+((3[0-1])|([1-2][0-9])|([0]?[1-9]))",
            "%Y %b %d",
            "%Y-%m-%d",
        ),
        (
            r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)",
            "%Y %b",
            "%Y-%m",
        ),
        (r"(\d{4})", "%Y", "%Y"),
    )

    def __init__(self, data=None, use_api_service=True):
        """National Institute of Health resource finder constructor.

        Args:
            data: Object stored as ``self._data``. When omitted, a fresh
                empty dict is created for this instance (the previous ``{}``
                default was a single dict shared by every instance).
            use_api_service: When false, ``_call_api`` performs no HTTP
                request and returns ``None``.
        """
        self._api = "https://pubmed.ncbi.nlm.nih.gov/"
        self._use_api_service = use_api_service
        self._p = "pmid:"
        # Keep the caller's object (even an empty one) when it is supplied.
        self._data = data if data is not None else {}
        self._im = ISSNManager()
        self._pm = PMIDManager()
        self._headers = {
            "User-Agent": "ResourceFinder / OpenCitations Indexes "
            "(http://opencitations.net; mailto:contact@opencitations.net)"
        }
        super().__init__()

    def _get_issn(self, txt_obj):
        """Return the set of normalised ISSNs found on ``IS`` lines of *txt_obj*.

        Every match is passed to ``self._im.normalise(..., include_prefix=True)``
        and values normalised to ``None`` are discarded. An empty or ``None``
        *txt_obj* (``_call_api`` can return ``None``) yields an empty set.
        """
        result = set()
        if not txt_obj:
            return result
        for match in re.finditer(self._issn_regex, txt_obj, re.MULTILINE):
            norm_issn = self._im.normalise(match.group(), include_prefix=True)
            if norm_issn is not None:
                result.add(norm_issn)
        return result

    def _get_extended_j_title(self, txt_obj):
        """Return the first non-empty ``JT`` value in *txt_obj*, stripped and URL-unquoted.

        Returns ``None`` when *txt_obj* is empty/``None`` or holds no such value.
        """
        if not txt_obj:
            return None
        for match in re.finditer(self._jtitle_regex, txt_obj, re.MULTILINE):
            title = match.group()
            if title:
                return unquote(title.strip())
        return None

    def _get_date(self, txt_obj):
        """Return the ``DP`` date of *txt_obj* as ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``.

        The most precise layout that both matches and parses is used, so an
        impossible day (e.g. ``2021 Feb 31``) degrades to ``2021-02`` instead
        of raising ``ValueError``. Returns ``None`` when *txt_obj* is
        empty/``None``, has no ``DP`` field, or no layout can be parsed.
        """
        if not txt_obj:
            return None
        found = re.search(self._date_regex, txt_obj, re.IGNORECASE)
        if found is None:
            # Previously ``.group(1)`` was called on None here (AttributeError).
            return None
        date = found.group(1)
        for pattern, parse_format, output_format in self._date_layouts:
            candidate = re.search(pattern, date, re.IGNORECASE)
            if candidate is None:
                continue
            try:
                parsed = datetime.strptime(candidate.group(0), parse_format)
            except ValueError:
                # Matched textually but is not a real date: try a coarser layout.
                continue
            return datetime.strftime(parsed, output_format)
        return None

    def _call_api(self, pmid_full):
        """Fetch the PubMed-format record for *pmid_full* and return its text.

        Returns:
            ``str()`` of the element whose id is ``article-details``, or
            ``None`` when the API service is disabled, the PMID cannot be
            normalised, the response status is not 200, or the element is
            missing from the page.

        Exceptions raised by ``requests.get`` (for example on timeout) are
        not caught here.
        """
        if not self._use_api_service:
            return None
        pmid = self._pm.normalise(pmid_full)
        if pmid is None:
            # NOTE(review): assumes normalise() signals an invalid id with
            # None; quote(None) would raise TypeError otherwise.
            return None
        r = get(
            self._api + quote(pmid) + "/?format=pubmed",
            headers=self._headers,
            timeout=30,
        )
        if r.status_code != 200:
            return None
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, features="lxml")
        details = soup.find(id="article-details")
        if details is None:
            # str(None) would hand the literal text "None" to the parsers.
            return None
        return str(details)