Coverage for oc_ds_converter / pubmed / finder_nih.py: 37%

65 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023-2024 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import re 

6from datetime import datetime 

7from urllib.parse import quote, unquote 

8 

9from bs4 import BeautifulSoup 

10from oc_ds_converter.oc_idmanager.issn import ISSNManager 

11from oc_ds_converter.oc_idmanager.pmid import PMIDManager 

12from requests import get 

13 

14 

class NIHResourceFinder:
    """Find metadata about a PMID by querying the NIH PubMed web site.

    The finder downloads the "pubmed" text rendition of a record and
    extracts ISSNs, the extended journal title and the publication date
    from it with regular expressions.
    """

    # Date layouts tried by ``_get_date``, from most to least precise.
    # Each entry is (regex applied to the DP value, strptime format used to
    # parse the regex match, strftime format of the returned string).
    _DATE_PRECISIONS = (
        (
            r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+((3[0-1])|([1-2][0-9])|([0]?[1-9]))",
            "%Y %b %d",
            "%Y-%m-%d",
        ),
        (
            r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)",
            "%Y %b",
            "%Y-%m",
        ),
        (r"(\d{4})", "%Y", "%Y"),
    )

    def __init__(self, data=None, use_api_service=True):
        """National Institute of Health resource finder constructor.

        Args:
            data: optional mapping stored on the instance as ``_data``.
                When omitted, every instance gets its own empty dict.
            use_api_service: when false, ``_call_api`` performs no HTTP
                request and returns ``None``.
        """
        self._api = "https://pubmed.ncbi.nlm.nih.gov/"
        self._use_api_service = use_api_service
        self._p = "pmid:"
        # A literal ``{}`` default would be created once and shared by all
        # instances; build a fresh dict per instance instead.
        self._data = {} if data is None else data
        self._im = ISSNManager()
        self._pm = PMIDManager()
        self._headers = {
            "User-Agent": "ResourceFinder / OpenCitations Indexes "
            "(http://opencitations.net; mailto:contact@opencitations.net)"
        }
        # "IS  - 1234-567X" lines: the lookbehind keeps only the ISSN itself.
        self._issn_regex = r"(?<=^IS\s{2}-\s)[0-9]{4}-[0-9]{3}[0-9X]"
        # "JT  - <title>" lines: everything after the tag up to end of line.
        self._jtitle_regex = r"(?<=^JT\s{2}-\s)(.*)(?<!\n)$"
        # "DP  - <year> [<month> [<day>]]": group 1 is the whole date value.
        self._date_regex = r"DP\s+-\s+(\d{4}(\s?(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))?(\s?((3[0-1])|([1-2][0-9])|([0]?[1-9])))?)"
        super().__init__()

    def _get_issn(self, txt_obj):
        """Return the set of normalised ISSNs found in ``txt_obj``.

        Every ``IS  - NNNN-NNNX`` line is passed to the ISSN manager's
        ``normalise`` (with ``include_prefix=True``); values it rejects
        (``None``) are skipped. Empty or ``None`` input yields an empty set.
        """
        result = set()
        if not txt_obj:
            return result
        for match_issn in re.finditer(self._issn_regex, txt_obj, re.MULTILINE):
            norm_issn = self._im.normalise(match_issn.group(), include_prefix=True)
            if norm_issn is not None:
                result.add(norm_issn)
        return result

    def _get_extended_j_title(self, txt_obj):
        """Return the first non-empty ``JT  - `` value in ``txt_obj``.

        The value is stripped of surrounding whitespace and percent-decoded.
        Returns ``None`` when the text is empty/``None`` or has no non-empty
        ``JT`` line.
        """
        if not txt_obj:
            return None
        for match in re.finditer(self._jtitle_regex, txt_obj, re.MULTILINE):
            title = match.group()
            if title:
                return unquote(title.strip())
        return None

    def _get_date(self, txt_obj):
        """Return the publication date found in the ``DP`` field of ``txt_obj``.

        The result is formatted as ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``
        depending on how much of the date is present and parseable.
        Returns ``None`` when the text is empty/``None`` or contains no
        ``DP`` field.
        """
        if not txt_obj:
            return None
        dp_match = re.search(self._date_regex, txt_obj, re.IGNORECASE)
        if dp_match is None:
            # No DP line at all: report "unknown" instead of raising
            # AttributeError on ``None.group``.
            return None
        date = dp_match.group(1)
        for pattern, parse_fmt, out_fmt in self._DATE_PRECISIONS:
            found = re.search(pattern, date, re.IGNORECASE)
            if found is None:
                continue
            try:
                parsed = datetime.strptime(found.group(0), parse_fmt)
            except ValueError:
                # The regex accepts calendar-invalid days (e.g. "Feb 30");
                # retry with the next, less precise layout.
                continue
            return datetime.strftime(parsed, out_fmt)
        return None

    def _call_api(self, pmid_full):
        """Download the PubMed-format record for ``pmid_full``.

        Returns the ``article-details`` element of the response page as a
        string, or ``None`` when the API service is disabled, the PMID
        cannot be normalised, the HTTP status is not 200, or the element
        is missing from the page. Exceptions raised by the HTTP request
        itself are not caught here.
        """
        if not self._use_api_service:
            return None
        pmid = self._pm.normalise(pmid_full)
        if pmid is None:
            # ``quote(None)`` would raise TypeError. Assumes the PMID manager
            # signals an invalid id with None, as the ISSN manager's
            # normalise is treated in ``_get_issn`` — TODO confirm.
            return None
        r = get(
            self._api + quote(pmid) + "/?format=pubmed",
            headers=self._headers,
            timeout=30,
        )
        if r.status_code != 200:
            return None
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, features="lxml")
        details = soup.find(id="article-details")
        if details is None:
            # Avoid returning the literal string "None" as metadata.
            return None
        return str(details)

100 return mdata