Coverage for ramose.py: 52%

1145 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-04 15:04 +0000

1# SPDX-FileCopyrightText: 2018-2021 essepuntato <essepuntato@gmail.com> 

2# SPDX-FileCopyrightText: 2020-2021 marilena <marilena.daquino2@unibo.it> 

3# SPDX-FileCopyrightText: 2022 dbrembilla <davide.brembilla98@gmail.com> 

4# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivanhb.ita@gmail.com> 

5# SPDX-FileCopyrightText: 2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

6# 

7# SPDX-License-Identifier: ISC 

8 

9 

10from abc import abstractmethod 

11from re import search, DOTALL, findall, sub, match, split 

12from requests import get, post, put, delete 

13from requests.exceptions import RequestException 

14from requests import Session as _RequestsSession 

15_http_session = _RequestsSession() 

16from csv import DictReader, reader, writer 

17from json import dumps 

18from io import StringIO 

19from sys import exc_info, maxsize, path 

20from collections import OrderedDict 

21from markdown import markdown 

22from importlib import import_module 

23from urllib.parse import parse_qs, urlsplit, quote, unquote 

24from operator import add, itemgetter, gt, eq, lt 

25from dateutil.parser import parse 

26from datetime import datetime 

27from isodate import parse_duration 

28from argparse import ArgumentParser 

29import json 

30import logging 

31import pysparql_anything 

32import re 

33import time 

34import yaml 

35from os.path import abspath, dirname, basename 

36from os import path as pt 

37from os import sep, getcwd 

38from itertools import product 

39 

40 

# Matches one "<type>(<field>)" pair in a '#field_type' configuration value,
# capturing the type name (group 1) and the field name (group 2).
FIELD_TYPE_RE = r"([^\(\s]+)\(([^\)]+)\)"
# Matches one "{param}" placeholder in an operation URL template,
# capturing the parameter name.
PARAM_NAME = r"{([^{}\(\)]+)}"
# Default timeout, in seconds, for outgoing HTTP requests.
DEFAULT_HTTP_TIMEOUT = 60

44 

45 

class HashFormatHandler(object):
    """Reader for files stored in Hash Format (see
    https://github.com/opencitations/ramose#Hashformat-configuration-file). A Hash Format
    file (.hf) is a specification file that includes information structured using the following
    syntax:

    ```
    #<field_name_1> <field_value_1>
    #<field_name_1> <field_value_2>
    #<field_name_3> <field_value_3>
    [...]
    #<field_name_n> <field_value_n>
    ```"""

    def read(self, file_path):
        """Parse the Hash Format document stored at 'file_path' and return its
        content as a list of dictionaries, one per object described in the file."""
        objects = []

        with open(file_path, "r", newline=None) as hf_file:
            separator_key = None
            current = None
            field = None
            for line in hf_file:
                matched = search(r"^#([^\s]+)\s(.+)$", line, DOTALL)
                if matched is not None:
                    field, value = matched.group(1), matched.group(2)
                    # Only process entries with both a name and a content
                    if field and value:
                        # The first field name seen acts as the object separator
                        if separator_key is None:
                            separator_key = field
                        # A new occurrence of the separator starts a new object;
                        # flush the previous one (if any) into the result first
                        if field == separator_key:
                            if current is not None:
                                objects.append(current)
                            current = {}
                        current[field] = value
                elif current:
                    # Continuation line: extend the value of the last seen field
                    current[field] += line
            # Flush the last object collected, if any
            if current:
                objects.append(current)

        # Strip the trailing newline(s) kept by the DOTALL match and by the
        # continuation-line concatenation above
        for obj in objects:
            for key in obj:
                obj[key] = obj[key].rstrip()

        return objects

class DocumentationHandler(object):
    def __init__(self, api_manager):
        """Base structure for rendering a human-readable documentation of all
        the operations described in the configuration files handled by the
        APIManager given as input."""
        # Keep the parsed configuration of every API handled by the manager.
        self.conf_doc = api_manager.all_conf

    @abstractmethod
    def get_documentation(self, *args, **dargs):
        """Return a string with the human-readable documentation of the
        operations made available by the input APIManager."""
        pass  # pragma: no cover

    @abstractmethod
    def store_documentation(self, file_path, *args, **dargs):
        """Store in 'file_path' the human-readable documentation of the
        operations made available by the input APIManager."""
        pass  # pragma: no cover

    @abstractmethod
    def get_index(self, *args, **dargs):
        """Return a string with the index of all the configuration files
        handled by the input APIManager."""
        pass  # pragma: no cover

class HTMLDocumentationHandler(DocumentationHandler):
    # HTML documentation: START
    def __title(self, conf):
        """Return the API title declared in the specification (the first
        element of 'conf_json' holds the API-level metadata)."""
        api_metadata = conf["conf_json"][0]
        return api_metadata["title"]

139 def __htmlmetadescription(self, conf): 

140 """This method returns the HTML meta-description tag defined in the API specification.""" 

141 desc = conf["conf_json"][0].get("html_meta_description") 

142 if desc: 

143 return '<meta name="description" content="%s"/>' % desc 

144 return "" # pragma: no cover 

145 

    def __sidebar(self, conf):
        """This method builds the sidebar of the API documentation"""
        result = ""

        # The first element of 'conf_json' holds the API-level metadata;
        # the following elements describe the single operations.
        i = conf["conf_json"][0]
        result += """
<h4>%s</h4>
<ul id="sidebar_menu" class="sidebar_menu">
    <li><a class="btn active" href="#description">DESCRIPTION</a></li>
    <li><a class="btn" href="#parameters">PARAMETERS</a></li>
    <li><a class="btn" href="#operations">OPERATIONS</a>
        <ul class="sidebar_submenu">%s</ul>
    </li>
    <li><a class="btn active" href="/">HOME</a></li>
</ul>
""" % \
            (i["title"], "".join(["<li><a class='btn' href='#%s'>%s</a></li>" % (op["url"], op["url"])
                                  for op in conf["conf_json"][1:]]))
        return result

    def __header(self, conf):
        """This method builds the header of the API documentation (title,
        version, API URL, contacts, license, description, and the shared
        parameters section), rendered from Markdown to HTML."""
        result = ""

        # API-level metadata lives in the first element of 'conf_json'.
        i = conf["conf_json"][0]
        result += """
<a id='toc'></a>
# %s

**Version:** %s <br/>
**API URL:** <a href="%s">%s</a><br/>
**Contact:** %s<br/>
**License:** %s<br/>



## <a id="description"></a>Description [back to top](#toc)

%s

%s""" % \
            (i["title"], i["version"], i["base"] + i["url"], i["base"] + i["url"], i["contacts"], i["license"],
             i["description"], self.__parameters())
        return markdown(result)

    def __parameters(self):
        """Return the Markdown-rendered section documenting the filtering
        parameters (require, filter, sort, format, json) shared by all the
        API operations."""
        result = """## <a id="parameters"></a>Parameters [back to top](#toc)

Parameters can be used to filter and control the results returned by the API. They are passed as normal HTTP parameters in the URL of the call. They are:

1. `require=<field_name>`: all the rows that have an empty value in the `<field_name>` specified are removed from the result set - e.g. `require=given_name` removes all the rows that do not have any string specified in the `given_name` field.

2. `filter=<field_name>:<operator><value>`: only the rows compliant with `<value>` are kept in the result set. The parameter `<operation>` is not mandatory. If `<operation>` is not specified, `<value>` is interpreted as a regular expression, otherwise it is compared by means of the specified operation. Possible operators are "=", "<", and ">". For instance, `filter=title:semantics?` returns all the rows that contain the string "semantic" or "semantics" in the field `title`, while `filter=date:>2016-05` returns all the rows that have a `date` greater than May 2016.

3. `sort=<order>(<field_name>)`: sort in ascending (`<order>` set to "asc") or descending (`<order>` set to "desc") order the rows in the result set according to the values in `<field_name>`. For instance, `sort=desc(date)` sorts all the rows according to the value specified in the field `date` in descending order.

4. `format=<format_type>`: the final table is returned in the format specified in `<format_type>` that can be either "csv" or "json" - e.g. `format=csv` returns the final table in CSV format. This parameter has higher priority of the type specified through the "Accept" header of the request. Thus, if the header of a request to the API specifies `Accept: text/csv` and the URL of such request includes `format=json`, the final table is returned in JSON.

5. `json=<operation_type>("<separator>",<field>,<new_field_1>,<new_field_2>,...)`: in case a JSON format is requested in return, tranform each row of the final JSON table according to the rule specified. If `<operation_type>` is set to "array", the string value associated to the field name `<field>` is converted into an array by splitting the various textual parts by means of `<separator>`. For instance, considering the JSON table `[ { "names": "Doe, John; Doe, Jane" }, ... ]`, the execution of `array("; ",names)` returns `[ { "names": [ "Doe, John", "Doe, Jane" ], ... ]`. Instead, if `<operation_type>` is set to "dict", the string value associated to the field name `<field>` is converted into a dictionary by splitting the various textual parts by means of `<separator>` and by associating the new fields `<new_field_1>`, `<new_field_2>`, etc., to these new parts. For instance, considering the JSON table `[ { "name": "Doe, John" }, ... ]`, the execution of `dict(", ",name,fname,gname)` returns `[ { "name": { "fname": "Doe", "gname": "John" }, ... ]`.

It is possible to specify one or more filtering operation of the same kind (e.g. `require=given_name&require=family_name`). In addition, these filtering operations are applied in the order presented above - first all the `require` operation, then all the `filter` operations followed by all the `sort` operation, and finally the `format` and the `json` operation (if applicable). It is worth mentioning that each of the aforementioned rules is applied in order, and it works on the structure returned after the execution of the previous rule.

Example: `<api_operation_url>?require=doi&filter=date:>2015&sort=desc(date)`."""
        return markdown(result)

    def __operations(self, conf):
        """This method returns the description of all the operations defined in the API."""
        result = """## Operations [back to top](#toc)
The operations that this API implements are:
"""
        ops = "\n"

        # One entry in the summary list ('result') and one detail card ('ops')
        # per operation described in the configuration file.
        for op in conf["conf_json"][1:]:
            params = []
            for p in findall(PARAM_NAME, op["url"]):
                # Defaults when the parameter has no "type(regex)" spec.
                p_type = "str"
                p_shape = ".+"
                if p in op:
                    p_type, p_shape = findall(
                        r"^\s*([^\(]+)\((.+)\)\s*$", op[p])[0]

                params.append(
                    "<em>%s</em>: type <em>%s</em>, regular expression shape <code>%s</code>" % (p, p_type, p_shape))
            # Summary bullet linking to the operation's detail card.
            result += "\n* [%s](#%s): %s" % (op["url"],
                                             op["url"], op["description"].split("\n")[0])
            # Detail card: methods, parameters, result field types, example
            # call, and exemplar JSON output.
            ops += """<div id="%s">
<h3>%s <a href="#operations">back to operations</a></h3>

%s

<p class="attr"><strong>Accepted HTTP method(s)</strong> <span class="attr_val method">%s</span></p>
<p class="attr params"><strong>Parameter(s)</strong> <span class="attr_val">%s</span></p>
<p class="attr"><strong>Result fields type</strong><span class="attr_val">%s</span></p>
<p class="attr"><strong>Example</strong><span class="attr_val"><a target="_blank" href="%s">%s</a></span></p>
<p class="ex attr"><strong>Exemplar output (in JSON)</strong></p>
<pre><code>%s</code></pre></div>""" % (op["url"], op["url"], markdown(op["description"]),
                                      ", ".join(
                                          split(r"\s+", op["method"].strip())), "</li><li>".join(params),
                                      ", ".join(["%s <em>(%s)</em>" % (f, t) for t, f in
                                                 findall(FIELD_TYPE_RE, op["field_type"])]),
                                      conf["website"] + conf["base_url"] + op["call"], op["call"], op["output_json"])
        return markdown(result) + ops

    def __footer(self):
        """This method returns the footer of the API documentation."""
        result = """This API and the related documentation has been created with <a href="https://github.com/opencitations/ramose" target="_blank">RAMOSE</a>, the *Restful API Manager Over SPARQL Endpoints*, developed by <a href="http://orcid.org/0000-0003-0530-4305" target="_blank">Silvio Peroni</a> and <a href="https://marilenadaquino.github.io">Marilena Daquino</a>."""
        return markdown(result)

    def __css(self):
        """Return the CSS stylesheet embedded inline in every generated HTML
        page (both the per-API documentation and the dashboard index)."""
        return """
        @import url('https://fonts.googleapis.com/css2?family=Karla:wght@300;400&display=swap');
        @media screen and (max-width: 850px) {
            aside { display: none; }
            main, #operations, .dashboard, body>footer {margin-left: 15% !important;}
            #operations > ul:nth-of-type(1) li { display:block !important; max-width: 100% !important; }
            h3 a[href] {display:block !important; float: none !important; font-size: 0.5em !important;}
            a {overflow: hidden; text-overflow: ellipsis;}
            .info_api, .api_calls {display: block !important; max-width: 100% !important;}
        }

        * {
            font-family: 'Karla', Geneva, sans-serif;
        }

        body {
            margin: 3% 15% 7% 0px;
            line-height: 1.5em;
            letter-spacing: 0.02em;
            font-size : 1em;
            font-weight:300;
            color: #303030;
            text-align: justify;
            background-color: #edf0f2;
        }

        aside {
            height : 100%;
            width: 20%;
            position: fixed;
            z-index: 1;
            top: 0;
            left: 0;
            /*background-color: #404040;*/
            overflow-x: hidden;
            background-color: white;
            box-shadow:0px 10px 30px 0px rgba(133,66,189,0.1);
        }
        p strong {
            text-transform: uppercase;
            font-size: 0.9em;
        }
        aside h4 {
            padding: 20px 9%;
            margin: 0px !important;
            color: #9931FC;
            text-align: left !important;
        }

        .sidebar_menu , .sidebar_submenu {
            list-style-type: none;
            padding-left:0px !important;
            margin-top: 10px;

        }

        .sidebar_menu > li {
            padding: 2% 0px;
            border-top : solid 0.7px grey;
        }

        .sidebar_menu a {
            padding: 1% 9%;
            background-image: none !important;
            color: grey;
            display: block;
        }

        .sidebar_menu a:hover {
            border-left: solid 5px rgba(154, 49, 252,.5);
            font-weight: 400;
        }

        .sidebar_submenu > li {
            padding-left:0px !important;
            background-color:#edf0f2;
            font-size: 0.8em;
        }

        main , #operations , .dashboard, body>footer {
            margin-left: 33%;
        }
        .dashboard {text-align: center;}
        main h1+p , .info_api{

            padding-left: 3%;
            font-size: 0.9em;
            line-height: 1.4em;
        }

        main h1+p {border-left: solid 5px rgba(154, 49, 252,.5);}

        #operations h3 {
            color: #9931FC;
            margin-bottom: 0px;
            padding: 10px;
        }

        #operations > ul:nth-of-type(1) {
            padding-left: 0px !important;
            text-align: center;
        }

        #operations > ul:nth-of-type(1) li {
            background-color: white;
            text-align: left;
            display: inline-block;
            overflow: hidden;
            text-overflow: ellipsis;
            max-width: 35%;
            height: 200px;
            padding:4%;
            margin: 1% 2% 1% 0px;
            border-radius: 10px;
            box-shadow: 0px 10px 30px 0px rgba(133,66,189,0.1);
            vertical-align:top;
        }

        #operations > div {
            background-color: white;
            margin-top: 20px;
            padding: 2%;
            border-radius: 18px;
            box-shadow: 0px 10px 30px 0px rgba(133,66,189,0.1);
        }

        #operations > div > * {
            padding: 0px 2%;
        }

        #operations > div ul, .params+ul{
            list-style-type: none;
            font-size: 0.85em;
        }
        #operations > div ul:nth-of-type(1) li, .params+ul li {
            margin: 10px 0px;
        }

        #operations > div ul:nth-of-type(1) li em, .params+ul li em {
            font-style: normal;
            font-weight: 400;
            color: #9931FC;
            border-left: solid 2px #9931FC;
            padding:5px;
        }

        .attr {
            border-top: solid 1px rgba(133,66,189,0.1);
            padding: 2% !important;
            display:block;
            vertical-align: top;
            font-size: 0.8em;
            text-align: left;
        }

        .attr strong {
            width: 30%;
            color: grey;
            font-weight: 400;
            font-style: normal;
            display:inline-block;
            vertical-align: top;
        }

        .attr_val {
            max-width: 50%;
            display:inline-table;
            height: 100%;
            vertical-align: top;
        }

        .method {
            text-transform: uppercase;
        }

        .params {
            margin-bottom: 0;
        }

        pre {
            background-color: #f0f0f5;
            padding: 10px;
            margin-top: 0;
            margin-bottom: 0;
            border-radius: 0 0 14px 14px;
            font-family: monospace !important;
            overflow: scroll;
            line-height: 1.2em;
            height: 250px;
        }

        pre code {
            font-family: monospace !important;
        }

        p.ex {
            background-color: #f0f0f5;
            margin-bottom: 0px;
            padding-top: 5px;
            padding-bottom: 5px;
        }

        h2:first-of-type {
            margin-bottom: 15px;
        }

        ol:first-of-type {
            margin-top: 0;
        }

        :not(pre) > code {
            background-color: #f0f0f5;
            color: #8585ad;
            padding: 0 2px 0 2px;
            border-radius: 3px;
            font-family : monospace;
            font-size: 1.2em !important;
        }

        /**:not(div) > p {
            margin-left: 1.2%;
        }*/

        h1 {font-size: 2.5em;}
        h1, h2 {
            text-transform: uppercase;
        }

        h1, h2, h3, h4, h5, h6 {
            line-height: 1.2em;
            padding-top:1em;
            text-align: left !important;
            font-weight:400;
        }

        h2 ~ h2, section > h2 {

            padding-top: 5px;
            margin-top: 40px;
        }

        h2 a[href], h3 a[href] {
            background-image: none;
            text-transform:uppercase;
            padding: 1px 3px 1px 3px;
            font-size: 12pt;
            float: right;
            position:relative;
            top: -3px;
        }

        h2 a[href]::before , h3 a[href]::before {
            content: " \u2191";
            width: 20px;
            height: 20px;
            display:inline-block;
            color: #9931FC;
            text-align:center;
            margin-right: 10px;
        }

        /*h3 a[href] {
            color:white
            background-image: none;
            text-transform:uppercase;
            padding: 1px 3px 1px 3px;
            font-size: 8pt !important;
            border: 1px solid #9931FC;
            float: right;
            position:relative;
            top: -11px;
            right: -11px;
            border-radius: 0 14px 0 0;
        }*/

        p {
            overflow-wrap: break-word;
            word-wrap: break-word;
        }

        a {
            color : black;
            text-decoration: none;
            background-image: -webkit-gradient(linear,left top, left bottom,color-stop(50%, transparent),color-stop(0, rgba(154, 49, 252,.5)));
            background-image: linear-gradient(180deg,transparent 50%,rgba(154, 49, 252,.5) 0);
            background-position-y: 3px;
            background-position-x: 0px;
            background-repeat: no-repeat;
            -webkit-transition: .15s ease;
            transition: .15s ease;
        }

        a:hover {
            color: #282828;
            background-position: top 6px right 0px;
            background-image: -webkit-gradient(linear,left top, left bottom,color-stop(60%, transparent),color-stop(0, #9931FC));
            background-image: linear-gradient(180deg,transparent 60%,#9931FC 0);
        }

        footer {
            margin-top: 20px;
            border-top: 1px solid lightgrey;
            text-align: center;
            color: grey;
            font-size: 9pt;
        }
        /* dashboard */

        .info_api {
            max-width: 35%;
            border-radius: 15px;
            text-align: left;
            vertical-align: top;
            background-color: #9931FC;
            color: white;
        }

        .info_api, .api_calls {
            display: inline-block;
            text-align: left;
            height: 200px;
            padding:4%;
            margin: 1% 2% 1% 0px;
            border-radius: 10px;
            box-shadow: 0px 10px 30px 0px rgba(133,66,189,0.1);
            vertical-align:top;
        }

        .api_calls {
            max-width: 40%;
            background-color: white;
            scroll-behavior: smooth;
            overflow: auto;
            overflow-y: scroll;
            scrollbar-color: #9931FC rgb(154, 49, 252);
            border-radius: 10px;
        }
        .api_calls div {padding-bottom:2%;}

        .api_calls:hover {
            overflow-y: scroll;
        }
        .api_calls h4, .info_api h2 {padding-top: 0px !important; margin-top: 0px !important;}
        .api_calls div p {
            padding: 0.2em 0.5em;
            border-top: solid 1px #F8F8F8;
        }

        .date_log , .method_log {
            color: grey;
            font-size: 0.8em;

        }
        .method_log {margin-left: 15px;}
        .date_log {display:inline-grid;}

        .group_log:nth-child(odd) {
            margin-right:5px;
            font-size: 0.9em;
        }

        .group_log:nth-child(even) {
            display: inline-grid;
            vertical-align: top;
        }
        .status_log {padding-right:15px;}
        .status_log::before {
            content: '';
            display: inline-block;
            width: 1em;
            height: 1em;
            vertical-align: middle;
            -moz-border-radius: 50%;
            -webkit-border-radius: 50%;
            border-radius: 50%;
            background-color: grey;
            margin-right: 0.8em;
        }

        .code_200::before {
            background-color: #00cc00;
        }

        .code_404::before {
            background-color: #cccc00;
        }

        .code_500::before {
            background-color: #cc0000;
        }

        """

654 def __css_path(self, css_path=None): 

655 """Add link to a css file if specified in argument -css""" 

656 return """<link rel="stylesheet" type="text/css" href='"""+css_path+"""'>""" if css_path else "" 

657 

    def logger_ramose(self):  # pragma: no cover
        """This method adds logging info to a local file ('ramose.log'),
        mirroring every record on the console as well."""
        # Shared format for both handlers attached to the root logger.
        logFormatter = logging.Formatter(
            "[%(asctime)s] [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
        rootLogger = logging.getLogger()

        # Persist records to 'ramose.log' in the current working directory;
        # this file is later parsed by __parse_logger_ramose for the dashboard.
        fileHandler = logging.FileHandler("ramose.log")
        fileHandler.setFormatter(logFormatter)
        rootLogger.addHandler(fileHandler)

        # Mirror the same records on the console.
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logFormatter)
        rootLogger.addHandler(consoleHandler)

    def __parse_logger_ramose(self):
        """This method reads logging info stored into a local file, so as to be browsed in the dashboard.
        Returns: the html including the list of URLs of current working APIs and basic logging info """
        try:
            with open("ramose.log") as l_f:
                logs = ''.join(l_f.readlines())
        except FileNotFoundError:
            # No log file yet: render the dashboard with an empty call history.
            logs = ""
        # De-duplicate log lines while keeping the most recent occurrence
        # first (the set records what has already been seen).
        rev_list = set()
        rev_list_add = rev_list.add
        rev_list = [x for x in list(reversed(logs.splitlines())) if not (
            x in rev_list or rev_list_add(x))]

        html = """
        <p></p>
        <aside>
        <h4>RAMOSE API DASHBOARD</h4>
        <ul id="sidebar_menu" class="sidebar_menu">"""

        # One sidebar entry per configured API.
        for api_url, api_dict in self.conf_doc.items():
            html += """
            <li><a class="btn active" href="%s">%s</a></li>
            """ % (api_url, api_dict["conf_json"][0]["title"])

        html += """
        </ul>
        </aside>
        <header class="dashboard">
        <h1>API MONITORING</h1>"""

        # One dashboard card per API: documentation/endpoint links plus its
        # latest (non-debug) calls extracted from the log.
        for api_url, api_dict in self.conf_doc.items():
            clean_list = [
                l for l in rev_list if api_url in l and "debug" not in l]
            api_logs_list = ''.join(["<p>"+self.clean_log(l, api_url)
                                     + "</p>" for l in clean_list if self.clean_log(l, api_url) != ''])
            api_title = api_dict["conf_json"][0]["title"]
            html += """
            <div class="info_api">
            <h2>%s</h2>
            <a id="view_doc" href="%s">VIEW DOCUMENTATION</a><br/>
            <a href="%s">GO TO SPARQL ENDPOINT</a><br/>
            </div>
            <div class="api_calls">
            <h4>Last calls</h4>
            <div>
            %s
            </div>

            </div>
            """ % (api_title, api_url, api_dict["tp"], api_logs_list)
        return html

    def get_documentation(self, css_path=None, base_url=None):
        """This method generates the HTML documentation of an API described in
        a configuration file. Returns a (status, html) pair where the status
        is always 200."""
        # When no base URL is given, document the first configured API.
        if base_url is None:
            first_key = next(iter(self.conf_doc))
            conf = self.conf_doc[first_key]
        else:
            conf = self.conf_doc['/'+base_url]

        return 200, """<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <title>%s</title>
        %s
        <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
        <meta name="viewport" content="width=device-width" />
        <style>%s</style>
        %s
    </head>
    <body>
        <aside>%s</aside>
        <main>%s</main>
        <section id="operations">%s</section>
        <footer>%s</footer>
    </body>
</html>""" % (
            self.__title(conf),
            self.__htmlmetadescription(conf),
            self.__css(),
            self.__css_path(css_path),
            self.__sidebar(conf),
            self.__header(conf),
            self.__operations(conf),
            self.__footer()
        )

    def get_index(self, css_path=None):
        """This method generates the index (dashboard) of all the HTML
        documentations that can be created from the configuration file."""

        return """
        <!doctype html>
        <html lang="en">
        <head>
        <meta charset="utf-8">
        <title>RAMOSE</title>
        <meta name="description" content="Documentation of RAMOSE API Manager">
        <style>%s</style>
        %s
        </head>
        <body>
        %s
        <footer>%s</footer>
        </body>
        </html>
        """ % (self.__css(), self.__css_path(css_path), self.__parse_logger_ramose(), self.__footer())

781 def store_documentation(self, file_path, css_path=None): 

782 """This method stores the HTML documentation of an API in a file.""" 

783 _, html = self.get_documentation(css_path) 

784 with open(file_path, "w") as f: 

785 f.write(html) 

786 

787 def clean_log(self, l, api_url): 

788 """This method parses logs lines into structured data.""" 

789 if "- - " not in l: 

790 return '' 

791 s = l.split("- - ", 1)[1] 

792 date = s[s.find("[")+1:s.find("]")] 

793 method = s.split('"')[1::2][0].split()[0] 

794 cur_call = s.split('"')[1::2][0].split()[1].strip() 

795 status = sub(r"\D+", "", s.split('"', 2)[2]) 

796 if cur_call != api_url+'/': 

797 full_str = "<span class='group_log'><span class='status_log code_"+status+"'>"+status+"</span>"+"<span class='date_log'>"+date+"</span><span class='method_log'>" + \ 

798 method+"</span></span>"+"<span class='group_log'><span class='call_log'><a href='" + \ 

799 cur_call+"' target='_blank'>"+cur_call+"</a></span></span>" 

800 else: 

801 full_str = '' 

802 return full_str 

803 

804 

class OpenAPIDocumentationHandler(DocumentationHandler):
    """
    Export RAMOSE .hf configuration(s) to an OpenAPI 3.0 YAML specification.

    Notes:
    - OpenAPI is a surface contract. RAMOSE implementation details are preserved as vendor extensions.
    - Extra RAMOSE config fields from Tables 1-2 are kept as x-ramose-* where OpenAPI has no native field.
    """

    # -------------------------
    # Small utilities
    # -------------------------

817 def _normalize_base_url(self, base_url): 

818 if base_url is None: 

819 return None 

820 return base_url[1:] if base_url.startswith("/") else base_url 

821 

822 def _get_conf(self, base_url=None): 

823 if base_url is None: 

824 first_key = next(iter(self.conf_doc)) 

825 return self.conf_doc[first_key] 

826 base_url = self._normalize_base_url(base_url) 

827 return self.conf_doc["/" + base_url] 

828 

829 def _schema_for_ramose_type(self, t): 

830 t = (t or "str").strip().lower() 

831 if t == "int": 

832 return {"type": "integer"} 

833 if t == "float": 

834 return {"type": "number"} 

835 if t == "datetime": 

836 return {"type": "string", "format": "date-time"} 

837 if t == "duration": 

838 # OpenAPI doesn't standardize duration; still useful as hint. 

839 return {"type": "string", "format": "duration"} 

840 return {"type": "string"} 

841 

842 def _parse_param_type_shape(self, s): 

843 # expected "type(regex)" 

844 try: 

845 t, shape = findall(r"^\s*([^\(]+)\((.+)\)\s*$", s)[0] 

846 return t.strip(), shape.strip() 

847 except Exception: 

848 return "str", ".+" 

849 

850 def _guess_contact(self, contacts_value): 

851 """ 

852 Table 1: '#contacts <contact_url>' but in practice it's often an email. 

853 Prefer OpenAPI contact.email when it looks like an email. 

854 """ 

855 if not contacts_value: 

856 return None 

857 c = str(contacts_value).strip() 

858 if "@" in c and " " not in c and "/" not in c: 

859 return {"email": c} 

860 return {"name": c} 

861 

862 def _clean_text(self, v): 

863 """ 

864 Normalize text coming from .hf parsing so Swagger/ YAML render nicely: 

865 - remove wrapping quotes if they were included as part of the value 

866 - turn literal '\\n' into real newlines 

867 - trim whitespace 

868 """ 

869 if v is None: 

870 return None 

871 s = str(v).strip() 

872 # Strip wrapping quotes if parser stored them as part of the value 

873 if len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): 

874 s = s[1:-1].strip() 

875 # Convert literal backslash-n sequences to actual newlines 

876 s = s.replace("\\n", "\n") 

877 return s 

878 

879 def _param_hint_from_preprocess(self, preprocess_str, param_name): 

880 """ 

881 Table 2: preprocess functions like 'lower(doi) --> split_dois(dois)'. 

882 Not formalizable in OpenAPI, but helpful as a hint. 

883 """ 

884 if not preprocess_str: 

885 return "" 

886 s = str(preprocess_str) 

887 # Any function call mentioning the param inside (...)? 

888 if re.search(r"\([^)]*\b" + re.escape(param_name) + r"\b[^)]*\)", s): 

889 return f"Note: input is pre-processed by RAMOSE: {s}" 

890 return "" 

891 

892 def _try_parse_output_json(self, output_json_value): 

893 """ 

894 Table 2: '#output_json <ex_response>' (JSON example). 

895 """ 

896 if not output_json_value: 

897 return None 

898 try: 

899 return json.loads(output_json_value) 

900 except Exception: 

901 return None 

902 

903 # ------------------------- 

904 # Formats / media-types 

905 # ------------------------- 

906 def _collect_format_tokens(self, conf): 

907 # always supported by RAMOSE docs 

908 formats = {"csv", "json"} 

909 for op in conf["conf_json"][1:]: 

910 if "format" in op: 

911 fm_val = op["format"] 

912 fm_list = fm_val if isinstance(fm_val, list) else [fm_val] 

913 for fm in fm_list: 

914 for part in str(fm).split(";"): 

915 part = part.strip() 

916 if not part: 

917 continue 

918 # expected "fmt,func" 

919 fmt = part.split(",", 1)[0].strip() 

920 if fmt: 

921 formats.add(fmt) 

922 return sorted(formats) 

923 

    def _media_type_for_format(self, fmt):
        """Map a RAMOSE format token (e.g. 'json', 'csv', 'ttl') to its IANA
        media type; returns None for unknown tokens."""
        fmt = (fmt or "").strip().lower()
        mapping = {
            "json": "application/json",
            "csv": "text/csv",
            "xml": "application/xml",
            "rdfxml": "application/rdf+xml",
            "rdf+xml": "application/rdf+xml",
            "ttl": "text/turtle",
            "turtle": "text/turtle",
            "nt": "application/n-triples",
            "ntriples": "application/n-triples",
            "n-triples": "application/n-triples",
            "nq": "application/n-quads",
            "n-quads": "application/n-quads",
            "trig": "application/trig",
        }
        return mapping.get(fmt, None)

943 def _build_response_content(self, ok_schema, formats_enum, ok_example=None, err_schema_ref=None): 

944 """ 

945 Build OpenAPI 'content' dict for responses based on supported formats. 

946 JSON gets structured schema. Others are represented as string payloads. 

947 If err_schema_ref is provided, also returns an error-content dict. 

948 """ 

949 content = OrderedDict() 

950 

951 # JSON: structured 

952 content["application/json"] = {"schema": ok_schema} 

953 if ok_example is not None: 

954 content["application/json"]["examples"] = {"example": {"value": ok_example}} 

955 

956 # CSV: textual 

957 content["text/csv"] = {"schema": {"type": "string"}} 

958 

959 # Other formats discovered in .hf (#format) 

960 for fmt in formats_enum or []: 

961 mt = self._media_type_for_format(fmt) 

962 if mt is None or mt in content: 

963 continue 

964 if mt in ("application/json", "text/csv"): 

965 continue 

966 content[mt] = {"schema": {"type": "string"}} 

967 

968 if err_schema_ref: 

969 err_content = OrderedDict() 

970 err_content["application/json"] = {"schema": {"$ref": err_schema_ref}} 

971 err_content["text/csv"] = {"schema": {"type": "string"}} 

972 for fmt in formats_enum or []: 

973 mt = self._media_type_for_format(fmt) 

974 if mt is None or mt in err_content: 

975 continue 

976 if mt in ("application/json", "text/csv"): 

977 continue 

978 err_content[mt] = {"schema": {"type": "string"}} 

979 return content, err_content 

980 

981 return content 

982 

983 # ------------------------- 

984 # Examples from #call 

985 # ------------------------- 

986 def _extract_param_examples_from_call(self, path_template, call_value): 

987 """ 

988 Given a template like '/metadata/{dois}' and a call like 

989 '/metadata/10.1/abc__10.2/xyz', return {'dois': '10.1/abc__10.2/xyz'}. 

990 

991 IMPORTANT: RAMOSE allows slashes inside the last param because it routes 

992 everything via <path:api_url>. OpenAPI tooling typically expects these 

993 slashes to be URL-encoded in examples. 

994 """ 

995 if not call_value: 

996 return {} 

997 

998 call_path = str(call_value).split("?", 1)[0].strip() 

999 

1000 if not path_template.startswith("/"): 

1001 path_template = "/" + path_template 

1002 if not call_path.startswith("/"): 

1003 call_path = "/" + call_path 

1004 

1005 parts = path_template.split("/") 

1006 re_parts = [] 

1007 

1008 # Allow '/' inside the LAST parameter segment (captures the rest of the path) 

1009 last_index = len(parts) - 1 

1010 

1011 for i, part in enumerate(parts): 

1012 if part.startswith("{") and part.endswith("}"): 

1013 name = part[1:-1] 

1014 if i == last_index: 

1015 # last param: capture everything to end, including slashes 

1016 re_parts.append(r"(?P<%s>.+)" % name) 

1017 else: 

1018 # middle params: standard segment (no slash) 

1019 re_parts.append(r"(?P<%s>[^/]+)" % name) 

1020 else: 

1021 re_parts.append(re.escape(part)) 

1022 

1023 pat = "^" + "/".join(re_parts) + "$" 

1024 m = re.match(pat, call_path) 

1025 if not m: 

1026 return {} 

1027 return {k: v for k, v in m.groupdict().items() if v is not None} 

1028 

1029 # ------------------------- 

1030 # Schema from field_type 

1031 # ------------------------- 

1032 def _build_row_schema_from_field_type(self, field_type_str): 

1033 props = OrderedDict() 

1034 for t, f in findall(FIELD_TYPE_RE, field_type_str or ""): 

1035 props[f] = self._schema_for_ramose_type(t) 

1036 return {"type": "object", "properties": props} 

1037 

1038 # ------------------------- 

1039 # Main builder 

1040 # ------------------------- 

    def _build_openapi(self, base_url=None):
        """Build an OpenAPI 3.0.3 document (as nested OrderedDicts) from the
        parsed RAMOSE configuration.

        :param base_url: optional base URL forwarded to self._get_conf
            (defined outside this view).
        :return: the specification, ready to be converted to builtins and
            serialised to YAML (see get_documentation).
        """
        conf = self._get_conf(base_url)
        # conf_json[0] holds the API-level metadata (Table 1 of the .hf file);
        # the remaining items each describe one operation (Table 2).
        api_meta = conf["conf_json"][0]
        formats_enum = self._collect_format_tokens(conf)

        spec = OrderedDict()
        spec["openapi"] = "3.0.3"

        # info
        spec["info"] = OrderedDict(
            [
                ("title", api_meta.get("title", "RAMOSE API")),
                ("version", api_meta.get("version", "0.0.0")),
            ]
        )
        if "description" in api_meta:
            spec["info"]["description"] = api_meta["description"]
        if "license" in api_meta:
            spec["info"]["license"] = {"name": api_meta["license"]}
        if "contacts" in api_meta:
            # _guess_contact (outside this view) may return a falsy value,
            # in which case no contact object is emitted.
            contact_obj = self._guess_contact(api_meta.get("contacts"))
            if contact_obj:
                spec["info"]["contact"] = contact_obj

        # servers: concatenation of the '#base' and '#url' Table 1 fields
        base = api_meta.get("base", "")
        root = api_meta.get("url", "")
        spec["servers"] = [{"url": f"{base}{root}"}]

        # Preserve additional Table 1 fields as vendor extensions
        if "endpoint" in api_meta:
            spec["x-ramose-endpoint"] = api_meta.get("endpoint")
        if "addon" in api_meta:
            spec["x-ramose-addon"] = api_meta.get("addon")
        if "method" in api_meta:
            # Table 1: method used to send request to SPARQL endpoint
            spec["x-ramose-sparql-method"] = api_meta.get("method")

        # components
        spec["components"] = {"schemas": {}, "parameters": {}}

        # Error payload shape shared by all operations (see RAMOSE error responses).
        spec["components"]["schemas"]["Error"] = {
            "type": "object",
            "properties": {"error": {"type": "integer"}, "message": {"type": "string"}},
            "required": ["error", "message"],
        }

        # Common query params (as in HTML docs)
        spec["components"]["parameters"]["require"] = {
            "name": "require",
            "in": "query",
            "description": "Remove rows that have an empty value in the specified field. Repeatable.",
            "required": False,
            "style": "form",
            "explode": True,
            "schema": {"type": "array", "items": {"type": "string"}},
        }
        spec["components"]["parameters"]["filter"] = {
            "name": "filter",
            "in": "query",
            "description": (
                "Filter rows. Repeatable.\n\n"
                "Syntax: `field:opvalue` where `op` is one of `=`, `<`, `>`.\n"
                "If `op` is omitted, `value` is treated as a regex."
            ),
            "required": False,
            "style": "form",
            "explode": True,
            "schema": {"type": "array", "items": {"type": "string"}},
        }
        spec["components"]["parameters"]["sort"] = {
            "name": "sort",
            "in": "query",
            "description": "Sort rows. Syntax: asc(field) or desc(field). Repeatable.",
            "required": False,
            "style": "form",
            "explode": True,
            "schema": {"type": "array", "items": {"type": "string"}},
        }
        spec["components"]["parameters"]["format"] = {
            "name": "format",
            "in": "query",
            "description": "Force output format (overrides Accept header).",
            "required": False,
            "schema": {"type": "string", "enum": formats_enum},
        }
        spec["components"]["parameters"]["json"] = {
            "name": "json",
            "in": "query",
            "description": (
                "Transform JSON output rows. Repeatable.\n\n"
                "Syntax:\n"
                "- `array(\"<sep>\", field)`\n"
                "- `dict(\"<sep>\", field, new_field_1, new_field_2, ...)`\n\n"
                "Where `<sep>` is a string separator (e.g. `,` or `__`)."
            ),
            "required": False,
            "style": "form",
            "explode": True,
            "schema": {"type": "array", "items": {"type": "string"}},
        }

        # Every operation gets the same five query parameters by reference.
        common_param_refs = [
            {"$ref": "#/components/parameters/require"},
            {"$ref": "#/components/parameters/filter"},
            {"$ref": "#/components/parameters/sort"},
            {"$ref": "#/components/parameters/format"},
            {"$ref": "#/components/parameters/json"},
        ]

        # paths
        spec["paths"] = OrderedDict()
        tag_name = api_meta.get("title", "RAMOSE API")

        for op in conf["conf_json"][1:]:
            raw_path = op.get("url", "")
            if not raw_path.startswith("/"):
                raw_path = "/" + raw_path

            if raw_path not in spec["paths"]:
                spec["paths"][raw_path] = OrderedDict()

            # path parameters: one per '{name}' placeholder in the URL template
            path_params = []
            for p in findall(PARAM_NAME, raw_path):
                t = "str"
                shape = ".+"
                if p in op:
                    # The operation may declare 'type(shape)' for this parameter;
                    # _parse_param_type_shape is defined outside this view.
                    t, shape = self._parse_param_type_shape(op[p])

                schema = self._schema_for_ramose_type(t)
                if schema.get("type") == "string" and shape:
                    schema["pattern"] = shape

                param_obj = {
                    "name": p,
                    "in": "path",
                    "required": True,
                    "schema": schema,
                }

                hint = self._param_hint_from_preprocess(op.get("preprocess"), p)
                if hint:
                    param_obj["description"] = hint

                path_params.append(param_obj)

            # Examples from Table 2 '#call'
            call_examples = self._extract_param_examples_from_call(raw_path, op.get("call"))
            for param in path_params:
                nm = param.get("name")
                if nm in call_examples:
                    # Encode slashes etc. so Swagger UI / generated clients behave correctly
                    # ('safe' lists characters NOT to escape; '_' is effectively listed twice,
                    # which is harmless).
                    param["example"] = quote(call_examples[nm], safe="-._~__")
                    if "__" in call_examples[nm] and "description" not in param:
                        param["description"] = "Multiple values can be provided separated by '__'."

            # response schema: array of row objects
            row_schema = self._build_row_schema_from_field_type(op.get("field_type", ""))
            ok_schema = {"type": "array", "items": row_schema}
            ok_example = self._try_parse_output_json(op.get("output_json"))

            ok_content, err_content = self._build_response_content(
                ok_schema=ok_schema,
                formats_enum=formats_enum,
                ok_example=ok_example,
                err_schema_ref="#/components/schemas/Error",
            )

            # methods can be space-separated in RAMOSE
            methods = split(r"\s+", op.get("method", "get").strip())
            for m in [mm for mm in methods if mm]:
                m = m.lower()

                # Summary: first line of the raw description (not cleaned).
                summary = ""
                if "description" in op and op["description"]:
                    summary = op["description"].split("\n")[0].strip()

                # Build a nicer description (and optionally include SPARQL as a markdown code block)
                desc = self._clean_text(op.get("description")) or ""
                spr = self._clean_text(op.get("sparql"))

                if spr:
                    desc += "\n\n---\n\n### RAMOSE SPARQL\n\n```sparql\n" + spr + "\n```"

                op_obj = OrderedDict(
                    [
                        ("tags", [tag_name]),
                        ("summary", summary),
                        ("description", desc),
                        ("parameters", path_params + common_param_refs),
                        (
                            "responses",
                            OrderedDict(
                                [
                                    (
                                        "200",
                                        {
                                            "description": "Successful response",
                                            "content": ok_content,
                                        },
                                    ),
                                    (
                                        "default",
                                        {
                                            "description": "Error",
                                            "content": err_content,
                                        },
                                    ),
                                ]
                            ),
                        ),
                    ]
                )

                # Option B: keep RAMOSE-specific stuff under one vendor extension object
                ramose_ext = OrderedDict()

                pre = self._clean_text(op.get("preprocess"))
                post_val = self._clean_text(op.get("postprocess"))
                call = self._clean_text(op.get("call"))

                if pre:
                    ramose_ext["preprocess"] = pre
                if post_val:
                    ramose_ext["postprocess"] = post_val
                if call:
                    ramose_ext["call"] = call

                # Instead of embedding the giant SPARQL here (which makes the YAML hard to read),
                # we indicate where it is rendered.
                if spr:
                    ramose_ext["sparql_in_description"] = True

                if ramose_ext:
                    op_obj["x-ramose"] = ramose_ext

                # Assign the operation
                spec["paths"][raw_path][m] = op_obj

        return spec

1282 

1283 # ------------------------- 

1284 # PyYAML compatibility 

1285 # ------------------------- 

1286 def _to_builtin(self, obj): 

1287 """Recursively convert OrderedDict (and other non-builtin containers) 

1288 to plain Python builtins so that yaml.safe_dump can serialize it.""" 

1289 if isinstance(obj, OrderedDict): 

1290 obj = dict(obj) 

1291 if isinstance(obj, dict): 

1292 return {k: self._to_builtin(v) for k, v in obj.items()} 

1293 if isinstance(obj, (list, tuple, set)): 

1294 return [self._to_builtin(v) for v in obj] 

1295 return obj 

1296 

1297 def _dump_yaml(self, spec): 

1298 """ 

1299 Dump OpenAPI spec to YAML with nice formatting: 

1300 - multiline strings become block scalars (|) 

1301 - keys keep insertion order (sort_keys=False) 

1302 """ 

1303 class _RamoseYamlDumper(yaml.SafeDumper): 

1304 pass 

1305 

1306 def _str_presenter(dumper, data): 

1307 if "\n" in data: 

1308 return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") 

1309 return dumper.represent_scalar("tag:yaml.org,2002:str", data) 

1310 

1311 _RamoseYamlDumper.add_representer(str, _str_presenter) 

1312 return yaml.dump(spec, Dumper=_RamoseYamlDumper, sort_keys=False, allow_unicode=True) 

1313 

1314 def get_documentation(self, base_url=None): 

1315 spec = self._build_openapi(base_url=base_url) 

1316 spec = self._to_builtin(spec) 

1317 yml = self._dump_yaml(spec) 

1318 return 200, yml 

1319 

1320 def store_documentation(self, file_path, base_url=None): 

1321 yml = self.get_documentation(base_url=base_url)[1] 

1322 with open(file_path, "w", encoding="utf8") as f: 

1323 f.write(yml) 

1324 

1325 def get_index(self, *args, **dargs): 

1326 # Not used by the current UI. Keep a minimal placeholder. 

1327 return "OpenAPI exporter available." 

1328 

1329 

class DataType(object):
    """Registry of the data types usable in a RAMOSE configuration file, with
    converters from the string representation to the related Python value."""

    def __init__(self):
        """Create the registry mapping each type name to its converter."""
        self.func = {
            "str": DataType.str,
            "int": DataType.int,
            "float": DataType.float,
            "duration": DataType.duration,
            "datetime": DataType.datetime
        }

    def get_func(self, name_str):
        """Return the converter registered under 'name_str' (None if unknown)."""
        return self.func.get(name_str)

    @staticmethod
    def duration(s):
        """Convert an XML Schema duration string (see
        https://www.w3.org/TR/xmlschema11-2/#duration) into a comparable
        datetime; None/empty input maps to a very high duration (2000 years)."""
        iso_value = "P2000Y" if s is None or s == "" else s
        return datetime(1983, 1, 15) + parse_duration(iso_value)

    @staticmethod
    def datetime(s):
        """Convert an ISO 8601 string (https://en.wikipedia.org/wiki/ISO_8601)
        into a datetime; None/empty input maps to the lowest date considered
        (0001-01-01)."""
        default = datetime(1, 1, 1, 0, 0)
        source = "0001-01-01" if s is None or s == "" else s
        return parse(source, default=default)

    @staticmethod
    def str(s):
        """Convert the input to a lowercase string; None maps to ''."""
        return "" if s is None else str(s).lower()

    @staticmethod
    def int(s):
        """Convert the input to an int; None/empty maps to a very low value."""
        return -maxsize if s is None or s == "" else int(s)

    @staticmethod
    def float(s):
        """Convert the input to a float; None/empty maps to a very low value."""
        return float(-maxsize) if s is None or s == "" else float(s)

1405 

1406 

1407class Operation(object): 

    def __init__(self, op_complete_url, op_key, i, tp, sparql_http_method, addon,
                 format=None, sources_map=None, allow_inline_endpoints=False, engine="sparql"):
        """Materialise an API operation to be run against a SPARQL endpoint (or,
        depending on configuration, through the SPARQL.Anything engine).

        :param op_complete_url: full URL of the call to the operation.
        :param op_key: the particular URL shape (template) representing the operation.
        :param i: the definition (in JSON) of the operation.
        :param tp: URL of the triplestore to contact.
        :param sparql_http_method: HTTP method for the SPARQL request ('get' or 'post').
        :param addon: path of the Python file defining additional functions usable
            by the operation.
        :param format: mapping of format names to the functions converting CSV data
            into those formats.
        :param sources_map: mapping of named sources to endpoint URLs referenced by
            @@with directives.
        :param allow_inline_endpoints: whether @@endpoint directives may override
            endpoints inline.
        :param engine: identifier of the execution backend.
        """
        self.url_parsed = urlsplit(op_complete_url)
        # Path component only (query string is handled separately by callers).
        self.op_url = self.url_parsed.path
        self.op = op_key
        self.i = i
        self.tp = tp
        self.sparql_http_method = sparql_http_method
        self.addon = addon
        self.format = format or {}
        self.sources_map = sources_map or {}
        self.allow_inline_endpoints = allow_inline_endpoints
        self.engine = engine
        # NOTE(review): presumably a lazily-created SPARQL.Anything engine
        # instance — None until first use; confirm against the engine code.
        self._sa_engine = None

        # Comparison operators supported by the 'filter' query parameter
        # (see handling_params).
        self.operation = {
            "=": eq,
            "<": lt,
            ">": gt
        }

        # Converters used to type values for comparisons and sorting.
        self.dt = DataType()

1444 

1445 # START: Ancillary methods 

1446 @staticmethod 

1447 def get_content_type(ct): 

1448 """It returns the mime type of a given textual representation of a format, being it either 

1449 'csv' or 'json.""" 

1450 content_type = ct 

1451 

1452 if ct == "csv": 

1453 content_type = "text/csv" 

1454 elif ct == "json": 

1455 content_type = "application/json" 

1456 

1457 return content_type 

1458 

1459 def conv(self, s, query_string, c_type="text/csv"): 

1460 """This method takes a string representing a CSV document and converts it in the requested format according 

1461 to what content type is specified as input.""" 

1462 

1463 content_type = Operation.get_content_type(c_type) 

1464 

1465 # Overwrite if requesting a particular format via the URL 

1466 if "format" in query_string: 

1467 req_formats = query_string["format"] 

1468 

1469 for req_format in req_formats: 

1470 content_type = Operation.get_content_type(req_format) 

1471 

1472 if req_format in self.format: 

1473 converter_func = getattr(self.addon, self.format[req_format]) 

1474 return converter_func(s), content_type 

1475 

1476 # If a non built-in format was requested but no converter ran, 

1477 # force CSV Content-Type instead of echoing the requested token. 

1478 if content_type not in ("text/csv", "application/json"): 

1479 content_type = "text/csv" 

1480 

1481 if "application/json" in content_type: 

1482 with StringIO(s) as f: 

1483 r = [] 

1484 for i in DictReader(f): 

1485 r.append(dict(i)) 

1486 

1487 # See if any restructuring of the final JSON is required 

1488 r = Operation.structured(query_string, r) 

1489 

1490 return dumps(r, ensure_ascii=False, indent=4), content_type 

1491 else: 

1492 return s, content_type 

1493 

1494 @staticmethod 

1495 def pv(i, r=None): 

1496 """This method returns the plain value of a particular item 'i' of the result returned by the SPARQL query. 

1497 

1498 In case 'r' is specified (i.e. a row containing a set of results), then 'i' must be the index of the item 

1499 within that row.""" 

1500 if r is None: 

1501 return i[1] 

1502 else: 

1503 return Operation.pv(r[i]) 

1504 

1505 @staticmethod 

1506 def tv(i, r=None): 

1507 """This method returns the typed value of a particular item 'i' of the result returned by the SPARQL query. 

1508 The type associated to that value is actually specified by means of the particular configuration provided 

1509 in the specification file of the API - field 'field_type'. 

1510 

1511 In case 'r' is specified (i.e. a row containing a set of results), then 'i' must be the index of the item 

1512 within that row.""" 

1513 if r is None: 

1514 return i[0] 

1515 else: 

1516 return Operation.tv(r[i]) 

1517 

1518 @staticmethod 

1519 def do_overlap(r1, r2): 

1520 """This method returns a boolean that says if the two ranges (i.e. two pairs of integers) passed as inputs 

1521 actually overlap one with the other.""" 

1522 r1_s, r1_e = r1 

1523 r2_s, r2_e = r2 

1524 

1525 return r1_s <= r2_s <= r1_e or r2_s <= r1_s <= r2_e 

1526 

1527 @staticmethod 

1528 def get_item_in_dict(d_or_l, key_list, prev=None): 

1529 """This method takes as input a dictionary or a list of dictionaries and browses it until the value 

1530 specified following the chain indicated in 'key_list' is not found. It returns a list of all the 

1531 values that matched with such search.""" 

1532 if prev is None: 

1533 res = [] 

1534 else: 

1535 res = prev.copy() 

1536 

1537 if type(d_or_l) is dict: 

1538 d_list = [d_or_l] 

1539 if type(d_or_l) is list: 

1540 d_list = d_or_l 

1541 

1542 for d in d_list: 

1543 key_list_len = len(key_list) 

1544 

1545 if key_list_len >= 1: 

1546 key = key_list[0] 

1547 if key in d: 

1548 if key_list_len == 1: 

1549 res.append(d[key]) 

1550 else: 

1551 res = Operation.get_item_in_dict(d[key], key_list[1:], res) 

1552 

1553 return res 

1554 

1555 @staticmethod 

1556 def add_item_in_dict(d_or_l, key_list, item, idx): 

1557 """This method takes as input a dictionary or a list of dictionaries, browses it until the value 

1558 specified following the chain indicated in 'key_list' is not found, and then substitutes it with 'item'. 

1559 In case the final object retrieved is a list, it selects the object in position 'idx' before the 

1560 substitution.""" 

1561 key_list_len = len(key_list) 

1562 

1563 if key_list_len >= 1: 

1564 key = key_list[0] 

1565 

1566 if type(d_or_l) is list: 

1567 if key_list_len == 1: 

1568 d_or_l[idx][key] = item 

1569 else: 

1570 for i in d_or_l: 

1571 Operation.add_item_in_dict(i, key_list, item, idx) 

1572 else: 

1573 if key in d_or_l: 

1574 if key_list_len == 1: 

1575 d_or_l[key] = item 

1576 else: 

1577 Operation.add_item_in_dict(d_or_l[key], key_list[1:], item, idx) 

1578 

    @staticmethod
    def structured(params, json_table):
        """This method checks if there are particular transformation rules specified in 'params' for a JSON output,
        and converts each row of the input table ('json_table') according to these rules.
        There are two specific rules that can be applied:

        1. array("<separator>",<field>): it converts the string value associated to the field name '<field>' into
        an array by splitting the various textual parts by means of '<separator>'. For instance, consider the
        following JSON structure:

        [
            { "names": "Doe, John; Doe, Jane" },
            { "names": "Doe, John; Smith, John" }
        ]

        Executing the rule 'array("; ",names)' returns the following new JSON structure:

        [
            { "names": [ "Doe, John", "Doe, Jane" ] },
            { "names": [ "Doe, John", "Smith, John" ] }
        ]

        2. dict("<separator>",<field>,<new_field_1>,<new_field_2>,...): it converts the string value associated to
        the field name '<field>' into a dictionary by splitting the various textual parts by means of
        '<separator>' and by associating the new fields '<new_field_1>', '<new_field_2>', etc., to these new
        parts. For instance, consider the following JSON structure:

        [
            { "name": "Doe, John" },
            { "name": "Smith, John" }
        ]

        Executing the rule 'dict(", ",name,family_name,given_name)' returns the following new JSON structure:

        [
            { "name": { "family_name": "Doe", "given_name": "John" } },
            { "name": { "family_name": "Smith", "given_name": "John" } }
        ]

        Each of the specified rules is applied in order, and it works on the JSON structure returned after
        the execution of the previous rule. The field may be a dotted path (e.g. 'author.name'): the rule is
        then applied to the nested value(s) reached by following that chain of keys. The table is modified in
        place and also returned."""
        if "json" in params:
            fields = params["json"]
            for field in fields:
                # Each rule has the shape: op("<separator>", field[, new_field, ...])
                ops = findall(r'([a-z]+)\(("[^"]+"),([^\)]+)\)', field)
                for op_type, s, es in ops:
                    separator = sub('"(.+)"', "\\1", s)  # strip the wrapping quotes
                    entries = [i.strip() for i in es.split(",")]
                    keys = entries[0].split(".")  # dotted path to the target field

                    for row in json_table:
                        v_list = Operation.get_item_in_dict(row, keys)
                        for idx, v in enumerate(v_list):
                            if op_type == "array":
                                if type(v) is str:
                                    # Empty string maps to an empty array, not [''].
                                    Operation.add_item_in_dict(row, keys,
                                                               v.split(separator) if v != "" else [], idx)
                            elif op_type == "dict":
                                new_fields = entries[1:]
                                # maxsplit keeps any extra separators inside the last new field
                                new_fields_max_split = len(new_fields) - 1
                                if type(v) is str:
                                    new_values = v.split(
                                        separator, new_fields_max_split)
                                    Operation.add_item_in_dict(row, keys,
                                                               dict(
                                                                   zip(new_fields, new_values)) if v != "" else {},
                                                               idx)
                                elif type(v) is list:
                                    # A list value (e.g. produced by a previous 'array'
                                    # rule) gets each element converted to a dict.
                                    new_list = []
                                    for i in v:
                                        new_values = i.split(separator, new_fields_max_split)
                                        new_list.append(dict(zip(new_fields, new_values)))
                                    Operation.add_item_in_dict(row, keys, new_list, idx)

        return json_table

1654 # END: Ancillary methods 

1655 

1656 # START: Processing methods 

1657 def preprocess(self, par_dict, op_item, addon): 

1658 """This method takes the a dictionary of parameters with the current typed values associated to them and 

1659 the item of the API specification defining the behaviour of that operation, and preprocesses the parameters 

1660 according to the functions specified in the '#preprocess' field (e.g. "#preprocess lower(doi)"), which is 

1661 applied to the specified parameters as input of the function in consideration (e.g. 

1662 "/api/v1/citations/10.1108/jd-12-2013-0166", converting the DOI in lowercase). 

1663 

1664 It is possible to run multiple functions sequentially by concatenating them with "-->" in the API 

1665 specification document. In this case the output of the function f_i will becomes the input operation URL 

1666 of the function f_i+1. 

1667 

1668 Finally, it is worth mentioning that all the functions specified in the "#preprocess" field must return 

1669 a tuple of values defining how the particular value passed in the dictionary must be changed.""" 

1670 result = par_dict 

1671 

1672 if "preprocess" in op_item: 

1673 

1674 for pre in [sub(r"\s+", "", i) for i in op_item["preprocess"].split(" --> ")]: 

1675 func_name = sub(r"^([^\(\)]+)\(.+$", r"\1", pre).strip() 

1676 params_name = sub(r"^.+\(([^\(\)]+)\).*", r"\1", pre).split(",") 

1677 

1678 param_list = () 

1679 for param_name in params_name: 

1680 param_list += (result[param_name],) 

1681 

1682 # run function 

1683 func = getattr(addon, func_name) 

1684 res = func(*param_list) 

1685 

1686 # substitute res to the current parameter in result 

1687 for idx in range(len(res)): 

1688 result[params_name[idx]] = res[idx] 

1689 

1690 return result 

1691 

1692 def postprocess(self, res, op_item, addon): 

1693 """This method takes the result table returned by running the SPARQL query in an API operation (specified 

1694 as input) and change some of such results according to the functions specified in the '#postprocess' 

1695 field (e.g. "#postprocess remove_date("2018")"). These functions can take parameters as input, while the first 

1696 unspecified parameters will be always the result table. It is worth mentioning that this result table (i.e. 

1697 a list of tuples) actually contains, in each cell, a tuple defining the plain value as well as the typed 

1698 value for enabling better comparisons and operations if needed. An example of this table of result is shown as 

1699 follows: 

1700 

1701 [ 

1702 ("id", "date"), 

1703 ("my_id_1", "my_id_1"), (datetime(2018, 3, 2), "2018-03-02"), 

1704 ... 

1705 ] 

1706 

1707 Note that the typed value and the plain value of each cell can be selected by using the methods "tv" and "pv" 

1708 respectively. In addition, it is possible to run multiple functions sequentially by concatenating them 

1709 with "-->" in the API specification document. In this case the output of the function f_i will becomes 

1710 the input result table of the function f_i+1.""" 

1711 result = res 

1712 

1713 if "postprocess" in op_item: 

1714 for post in [i.strip() for i in op_item["postprocess"].split(" --> ")]: 

1715 func_name = sub(r"^([^\(\)]+)\(.+$", r"\1", post).strip() 

1716 param_str = sub(r"^.+\(([^\(\)]*)\).*", r"\1", post) 

1717 if param_str == "": 

1718 params_values = () 

1719 else: 

1720 params_values = next(reader(param_str.splitlines(), skipinitialspace=True)) 

1721 

1722 func = getattr(addon, func_name) 

1723 func_params = (result,) + tuple(params_values) 

1724 result, do_type_fields = func(*func_params) 

1725 if do_type_fields: 

1726 result = self.type_fields(result, op_item) 

1727 

1728 return result 

1729 

    def handling_params(self, params, table):
        """This method is used for filtering the results that are returned after the post-processing
        phase. In particular, it is possible to:

        1. [require=<field_name>] exclude all the rows that have an empty value in the field specified - e.g.
        "require=doi" removes all the rows that do not have any string specified in the "doi" field;

        2. [filter=<field_name>:<operator><value>] consider only the rows where the string in the input field
        is compliant with the value specified. If no operation is specified, the value is interpreted as a
        regular expression, otherwise it is compared according to the particular type associated to that field.
        Possible operators are "=", "<", and ">" - e.g. "filter=title:semantics?" returns all the rows that contain
        the string "semantic" or "semantics" in the field title, while "filter=date:>2016-05" returns all the rows
        that have a date greater than May 2016;

        3. [sort=<order>(<field_name>)] sort all the results according to the value and type of the particular
        field specified in input. It is possible to sort the rows either in ascending ("asc") or descending
        ("desc") order - e.g. "sort=desc(date)" sort all the rows according to the value specified in the
        field "date" in descending order.

        Note that these filtering operations are applied in the order presented above - first the "require", then
        the "filter", and finally the "sort". It is possible to specify one or more filtering operation of the
        same kind (e.g. "require=doi&require=title").

        'table' is a list whose first element is the header row; each data cell is a
        (typed_value, plain_value) pair - see 'tv' and 'pv'. The filtered table
        (header included) is returned.
        """
        header = table[0]
        result = table[1:]

        # "exclude" is handled identically to "require" (an accepted alias).
        if "exclude" in params or "require" in params:
            fields = params["exclude"] if "exclude" in params else params["require"]
            for field in fields:
                field_idx = header.index(field)
                tmp_result = []
                for row in result:
                    value = Operation.pv(field_idx, row)
                    # Keep only rows whose plain value is neither None nor "".
                    if value is not None and value != "":
                        tmp_result.append(row)
                result = tmp_result

        if "filter" in params:
            fields = params["filter"]
            for field in fields:
                field_name, field_value = field.split(":", 1)

                try:
                    field_idx = header.index(field_name)
                    flag = field_value[0]
                    if flag in ("<", ">", "="):
                        # Typed comparison: convert the literal with the same
                        # DataType converter as the cell's typed value.
                        value = field_value[1:].lower()
                        tmp_result = []
                        for row in result:
                            v_result = Operation.tv(field_idx, row)
                            v_to_compare = self.dt.get_func(type(v_result).__name__)(value)

                            if self.operation[flag](v_result, v_to_compare):
                                tmp_result.append(row)
                        result = tmp_result

                    else:
                        # No operator: treat the whole value as a case-insensitive regex
                        # matched against the plain value.
                        result = list(filter(
                            lambda i: search(field_value.lower(),
                                             Operation.pv(field_idx, i).lower()), result))
                except ValueError:
                    # Unknown field name (header.index failed) or a comparison
                    # literal the type converter rejects: the filter is skipped.
                    pass  # do nothing

        if "sort" in params:
            # NOTE(review): the sort keys are processed in reverse-sorted order —
            # presumably so that the primary key ends up applied last (stable
            # sort); confirm against the documented multi-sort behaviour.
            fields = sorted(params["sort"], reverse=True)
            field_names = []
            order = []
            for field in fields:
                # Accept either "asc(field)"/"desc(field)" or a bare field name
                # (which defaults to ascending order).
                order_names = findall(r"^(desc|asc)\(([^\(\)]+)\)$", field)
                if order_names:
                    order.append(order_names[0][0])
                    field_names.append(order_names[0][1])
                else:
                    order.append("asc")
                    field_names.append(field)

            for idx in range(len(field_names)):
                field_name = field_names[idx]
                try:
                    desc_order = False
                    if idx < len(order):
                        field_order = order[idx].lower().strip()
                        desc_order = True if field_order == "desc" else False

                    field_idx = header.index(field_name)
                    # Rows sort on the whole (typed, plain) cell tuple of the column.
                    result = sorted(result, key=itemgetter(field_idx), reverse=desc_order)
                except ValueError:
                    # Unknown sort field: that sort key is skipped.
                    pass  # do nothing

        return [header] + result

1820 

1821 def type_fields(self, res, op_item): 

1822 """It creates a version of the results 'res' that adds, to each value of the fields, the same value interpreted 

1823 with the type specified in the specification file (field 'field_type'). Note that 'str' is used as default in 

1824 case no further specifications are provided.""" 

1825 result = [] 

1826 cast_func = {} 

1827 header = res[0] 

1828 for heading in header: 

1829 cast_func[heading] = DataType.str 

1830 

1831 if "field_type" in op_item: 

1832 for f, p in findall(FIELD_TYPE_RE, op_item["field_type"]): 

1833 cast_func[p] = self.dt.get_func(f) 

1834 

1835 for row in res[1:]: 

1836 new_row = [] 

1837 for idx in range(len(header)): 

1838 heading = header[idx] 

1839 cur_value = row[idx] 

1840 if type(cur_value) is tuple: 

1841 cur_value = cur_value[1] 

1842 new_row.append((cast_func[heading](cur_value), cur_value)) 

1843 result.append(new_row) 

1844 

1845 return [header] + result 

1846 

1847 def remove_types(self, res): 

1848 """This method takes the results 'res' that include also the typed value and returns a version of such 

1849 results without the types that is ready to be stored on the file system.""" 

1850 result = [res[0]] 

1851 

1852 for row in res[1:]: 

1853 result.append(tuple(Operation.pv(idx, row) for idx in range(len(row)))) 

1854 

1855 return result 

1856 

1857 @staticmethod 

1858 def _is_directive(line): 

1859 return line.strip().startswith("@@") 

1860 

    def _parse_steps(self, text, default_endpoint, params):
        """Parse the '#sparql' text of a multi-source operation into a list of steps.

        Lines starting with '@@' are directives; every run of non-directive lines
        between them is accumulated into a single SPARQL query step. Returned step
        tuples:
        - ("QUERY", endpoint_url, query_text)
        - ("JOIN", left_var, right_var, how)       # how in {"inner","left"}
        - ("REMOVE", [vars])
        - ("VALUES_INJECT", [vars])                # @@values ?var1 ?var2 ...
        - ("FOREACH_SETUP", alias, var_name)       # @@values ?var:alias
        - ("FOREACH_MARK", alias, delay_seconds)   # @@foreach alias [delay]

        '@@with <name>' (resolved through self.sources_map) and '@@endpoint <url>'
        (only if self.allow_inline_endpoints) do not emit a step: they change the
        endpoint bound to the QUERY steps that follow.

        :param text: the raw '#sparql' block, possibly containing @@ directives.
        :param default_endpoint: endpoint used until a @@with/@@endpoint directive.
        :param params: mapping used to substitute "[[name]]" placeholders in each
            emitted query.
        :raises ValueError: for unknown directives or malformed arguments.
        """
        steps = []
        cur_query = []
        current_endpoint = default_endpoint

        def flush_query():
            # Close the query accumulated so far, substitute the [[...]] params,
            # and emit it bound to whichever endpoint is currently active.
            # Blank-only accumulations are discarded without emitting a step.
            if cur_query:
                q = "\n".join(cur_query).strip()
                if not q:
                    cur_query.clear()
                    return
                # parameter substitution [[...]]
                for p, v in params.items():
                    q = q.replace(f"[[{p}]]", str(v))
                steps.append(("QUERY", current_endpoint, q))
                cur_query.clear()

        for raw in text.splitlines():
            line = raw.rstrip("\n")
            if not self._is_directive(line):
                cur_query.append(line)
                continue

            # directive line -> first close any pending query
            flush_query()

            body = line.strip()[2:].strip()  # remove leading @@
            # NOTE(review): a bare "@@" line makes parts empty and parts[0] raise
            # IndexError here - presumably never produced by valid configs; verify.
            parts = body.split()
            cmd = parts[0].lower()

            if cmd == "with":
                # switch to a named endpoint declared in #sources
                name = parts[1]
                if name not in self.sources_map:
                    raise ValueError(f"Unknown source '{name}' in @@with; declare it in #sources.")
                current_endpoint = self.sources_map[name]

            elif cmd == "endpoint":
                # switch to an explicit endpoint URL (opt-in via configuration)
                url = parts[1]
                if not self.allow_inline_endpoints:
                    raise ValueError("@@endpoint not allowed (enable #allow_inline_endpoints).")
                current_endpoint = url

            elif cmd == "join":
                left = parts[1]
                right = parts[2]
                how = "inner"  # default join type when no "type=" argument is given
                if len(parts) >= 4 and parts[3].startswith("type="):
                    how = parts[3].split("=", 1)[1].lower()
                steps.append(("JOIN", left, right, how))

            elif cmd == "remove":
                vars_ = parts[1:]
                steps.append(("REMOVE", vars_))

            elif cmd == "values":
                # syntax:
                #   @@values ?var1 ?var2 ...   -> VALUES_INJECT into the next query
                #   @@values ?var:alias       -> FOREACH_SETUP (for @@foreach)
                tokens = parts[1:]
                if not tokens:
                    raise ValueError("@@values needs at least one variable")

                alias_specs = [t for t in tokens if ":" in t]
                if alias_specs:
                    # We only support exactly one ?var:alias pair for now
                    if len(tokens) != 1 or len(alias_specs) != 1:
                        raise ValueError(
                            "@@values with alias supports exactly one ?var:alias pair"
                        )
                    var_token = alias_specs[0]
                    var_name, alias = var_token.split(":", 1)
                    steps.append(("FOREACH_SETUP", alias, var_name))
                else:
                    vars_ = tokens
                    steps.append(("VALUES_INJECT", vars_))

            elif cmd == "foreach":
                # syntax: @@foreach alias [delay_seconds]
                if len(parts) < 2:
                    raise ValueError("@@foreach requires an alias name")
                alias = parts[1]
                delay = 0.0
                if len(parts) >= 3:
                    try:
                        delay = float(parts[2])
                    except ValueError:
                        raise ValueError(f"Invalid delay value in @@foreach: {parts[2]!r}")
                steps.append(("FOREACH_MARK", alias, delay))

            else:
                raise ValueError(f"Unknown directive @@{cmd}")

        # emit whatever query text trails the last directive
        flush_query()
        return steps

1966 

1967 def _run_sparql_dicts(self, endpoint_url, query_text): 

1968 """Run a SELECT query against a SPARQL endpoint and return a list of dict rows. 

1969 

1970 This always requests CSV and parses it via DictReader, to stay consistent 

1971 with RAMOSE's legacy pipeline. 

1972 """ 

1973 try: 

1974 if self.sparql_http_method == "get": 

1975 r = _http_session.get( 

1976 endpoint_url + "?query=" + quote(query_text), 

1977 headers={ 

1978 "Accept": "text/csv", 

1979 "User-Agent": "RAMOSE/2.0.0", 

1980 }, 

1981 timeout=DEFAULT_HTTP_TIMEOUT, 

1982 ) 

1983 else: 

1984 r = _http_session.post( 

1985 endpoint_url, 

1986 data=query_text, 

1987 headers={ 

1988 "Accept": "text/csv", 

1989 "Content-Type": "application/sparql-query", 

1990 "User-Agent": "RAMOSE/2.0.0", 

1991 }, 

1992 timeout=DEFAULT_HTTP_TIMEOUT, 

1993 ) 

1994 except RequestException as e: 

1995 raise RuntimeError(f"SPARQL request failed: {e}") from e 

1996 

1997 r.encoding = "utf-8" 

1998 if r.status_code != 200: 

1999 raise RuntimeError(f"SPARQL {r.status_code}: {r.reason}") 

2000 text = r.content.decode("utf-8-sig", errors="replace") 

2001 list_of_lines = text.splitlines() 

2002 return list(DictReader(list_of_lines)) 

2003 

    def _run_sparql_anything_dicts(self, query_text, values=None):
        """
        Execute a SPARQL Anything SELECT query via PySPARQL-Anything and return
        a list of dicts (one per row), in the same shape as _run_sparql_dicts.

        :param query_text: full SPARQL (Anything) query string
            (typically containing SERVICE <x-sparql-anything:...>).
        :param values: optional dict of template parameters for the query
            (name -> value), passed to SPARQL Anything's `values=`.
        :return: list of dict rows; the exact keys depend on which of the
            normalisation branches below applies to the engine's output.
        """
        # Lazily create and cache the engine on the instance so we don't
        # re-initialise the JVM on every call.
        engine = getattr(self, "_sa_engine", None)
        if engine is None:
            engine = pysparql_anything.SparqlAnything()
            self._sa_engine = engine

        # Build kwargs for PySPARQL-Anything
        kwargs = {"query": query_text}
        if values:
            # SPARQL Anything expects a dict[str, str]
            kwargs["values"] = {str(k): str(v) for k, v in values.items()}

        # Ask PySPARQL-Anything for a Python dict structure
        result = engine.select(output_type=dict, **kwargs)

        # --- Normalisation to list[dict] -----------------------------------
        # The engine's output shape varies; the branches below cover, in order:
        # lists, non-dict scalars, the standard SPARQL JSON result set, a
        # column-oriented mapping, and finally a single-row dict.

        # 1) If it's already a list of dicts, just return it.
        if isinstance(result, list):
            if result and isinstance(result[0], dict):
                return result
            # list but not dicts (tuples, etc.): coerce each element to a dict
            return [dict(row) for row in result]

        # 2) If it's not a dict at all, just wrap it as a single-row result.
        if not isinstance(result, dict):
            return [dict(result=result)]

        # 3) Try standard SPARQL JSON ResultSet shape:
        #    { "head": {"vars": [...]}, "results": { "bindings": [...] } }
        head = result.get("head")
        results = result.get("results")
        if isinstance(head, dict) and isinstance(results, dict) and "bindings" in results:
            vars_ = head.get("vars") or []
            rows = []
            for b in results.get("bindings", []):
                row = {}
                for v in vars_:
                    cell = b.get(v)
                    if isinstance(cell, dict):
                        # standard SPARQL JSON: { "type": "...", "value": "..." , ... }
                        row[v] = cell.get("value")
                    else:
                        # unbound variable (None) or already-plain value
                        row[v] = cell
                rows.append(row)
            return rows

        # 4) Otherwise assume it is a mapping column_name -> list-of-values (or scalars)
        rows = []
        cols = list(result.keys())

        # Find maximum column length, if columns are lists/tuples
        max_len = 0
        for c in cols:
            v = result[c]
            if isinstance(v, (list, tuple)):
                max_len = max(max_len, len(v))

        if max_len:
            for i in range(max_len):
                row = {}
                for c in cols:
                    v = result[c]
                    if isinstance(v, (list, tuple)):
                        # shorter columns are padded with None
                        row[c] = v[i] if i < len(v) else None
                    else:
                        # scalar: repeat in every row
                        row[c] = v
                rows.append(row)
            return rows

        # 5) Fallback: treat the dict as a single-row result
        return [result]

2085 

2086 def _run_query_dicts(self, endpoint_url, query_text): 

2087 """ 

2088 Dispatch query execution to the appropriate backend, with support 

2089 for per-query engine selection in multi-source mode. 

2090 

2091 Rules: 

2092 - If endpoint_url is the special string "sparql-anything" (case-insensitive), 

2093 then always use SPARQL.ANYTHING (PySPARQL-Anything) for this query. 

2094 - Otherwise, fall back to the operation-level engine: 

2095 * engine == "sparql-anything" -> SPARQL.ANYTHING 

2096 * else -> standard HTTP SPARQL 

2097 """ 

2098 

2099 # Per-query override: @@endpoint sparql-anything 

2100 if endpoint_url and str(endpoint_url).strip().lower() == "sparql-anything": 

2101 return self._run_sparql_anything_dicts(query_text) 

2102 

2103 # Default behaviour: operation-level engine 

2104 if self.engine == "sparql-anything": 

2105 return self._run_sparql_anything_dicts(query_text) 

2106 else: 

2107 return self._run_sparql_dicts(endpoint_url, query_text) 

2108 

2109 def _inject_values_clause(self, query_text, vars_, acc_rows): 

2110 # build distinct tuples for requested vars from the accumulator 

2111 cols = [v.lstrip("?") for v in vars_] 

2112 tuples, seen = [], set() 

2113 for row in (acc_rows or []): 

2114 tup = tuple(row.get(c, "") for c in cols) 

2115 if all(tup) and tup not in seen: 

2116 seen.add(tup) 

2117 tuples.append(tup) 

2118 if not tuples: 

2119 return query_text # nothing to inject 

2120 

2121 # format literals vs IRIs 

2122 def fmt(x): 

2123 s = str(x) 

2124 if s.startswith("http://") or s.startswith("https://"): 

2125 return f"<{s}>" 

2126 return '"' + s.replace('\\', '\\\\').replace('"', '\\"') + '"' 

2127 

2128 head = "VALUES (" + " ".join(vars_) + ") {\n" 

2129 body = "\n".join(" (" + " ".join(fmt(v) for v in tup) + ")" for tup in tuples) 

2130 tail = "\n}\n" 

2131 

2132 i = query_text.find("{") 

2133 if i == -1: 

2134 # no WHERE brace: put VALUES at top (legal SPARQL) 

2135 return head + body + tail + query_text 

2136 j = i + 1 

2137 return query_text[:j] + "\n" + head + body + tail + query_text[j:] 

2138 

2139 @staticmethod 

2140 def _drop_columns(rows, vars_): 

2141 if not rows: 

2142 return rows 

2143 vars_set = set(v.lstrip("?") for v in vars_) 

2144 out = [] 

2145 for r in rows: 

2146 out.append({k: v for k, v in r.items() if k not in vars_set and ("?" + k) not in vars_set}) 

2147 return out 

2148 

2149 def _norm_join_key(self, v): 

2150 if v is None: 

2151 return None 

2152 s = str(v).strip() 

2153 # unify scheme for w3id IRIs (and similar) 

2154 if s.startswith("http://"): 

2155 s = "https://" + s[len("http://"):] 

2156 # drop a single trailing slash for stability 

2157 if s.endswith("/"): 

2158 s = s[:-1] 

2159 return s 

2160 

2161 def _join(self, left_rows, right_rows, lkey, rkey, how="inner"): 

2162 """ 

2163 Merge two row sets on lkey (from left_rows) and rkey (from right_rows). 

2164 - lkey/rkey may be passed as '?var' or 'var' -> we normalize to bare names. 

2165 - Keys are normalized with _norm_join_key (e.g., http -> https, trim slash). 

2166 - When 'left', all left rows are preserved even if no match on the right. 

2167 - Right-hand columns are copied into the merged row; collisions are avoided. 

2168 """ 

2169 # 1) Normalize column names (strip leading '?') 

2170 lcol = lkey.lstrip("?") 

2171 rcol = rkey.lstrip("?") 

2172 

2173 left_rows = left_rows or [] 

2174 right_rows = right_rows or [] 

2175 

2176 # 2) Build an index for right_rows on normalized rcol values 

2177 rindex = {} 

2178 for r in right_rows: 

2179 rk = self._norm_join_key(r.get(rcol)) 

2180 if rk is None: 

2181 continue 

2182 rindex.setdefault(rk, []).append(r) 

2183 

2184 # determine right columns to copy (excluding the join key) 

2185 right_cols = [c for c in (right_rows[0].keys() if right_rows else []) if c != rcol] 

2186 

2187 out = [] 

2188 for l in left_rows: 

2189 lk = self._norm_join_key(l.get(lcol)) 

2190 matches = rindex.get(lk, []) 

2191 if matches: 

2192 for r in matches: 

2193 merged = dict(l) 

2194 for c in right_cols: 

2195 rv = r.get(c) 

2196 if rv is None: 

2197 continue 

2198 if c not in merged or merged[c] in ("", None): 

2199 merged[c] = rv 

2200 else: 

2201 alt = f"{c}_r" 

2202 if alt not in merged or merged[alt] in ("", None): 

2203 merged[alt] = rv 

2204 out.append(merged) 

2205 else: 

2206 if how == "left": 

2207 out.append(dict(l)) 

2208 # inner: drop 

2209 return out 

2210 

2211 @staticmethod 

2212 def _header_from_field_type(op_item, acc): 

2213 # Respect #field_type order if provided, else derive from data 

2214 if "field_type" in op_item: 

2215 # FIELD_TYPE_RE is global in this file 

2216 return [f for (_, f) in findall(FIELD_TYPE_RE, op_item["field_type"])] 

2217 # fallback to keys of first row 

2218 return list(acc[0].keys()) if acc else [] 

2219 

2220 @staticmethod 

2221 def _to_csv_rows(header, acc): 

2222 rows = [header] 

2223 for d in acc: 

2224 rows.append([d.get(h, "") for h in header]) 

2225 return rows 

2226 

    def exec(self, method="get", content_type="application/json"):
        """This method takes in input the HTTP method to use for the call
        and the content type to return, and execute the operation as indicated
        in the specification file, by running (in the following order):

        1. the methods to preprocess the query;
        2. the SPARQL query related to the operation called, by using the parameters indicated in the URL;
        3. the specification of all the types of the various rows returned;
        4. the methods to postprocess the result;
        5. the application of the filter to remove, filter, sort the result;
        6. the removal of the types added at the step 3, so as to have a data structure ready to be returned;
        7. the conversion in the format requested by the user.

        :param method: the HTTP method of the incoming call (checked against the
            methods the operation declares in its '#method' entry).
        :param content_type: the content type requested by the caller, forwarded
            to self.conv for the final serialisation.
        :return: a (status_code, body, content_type) triple.
        """
        str_method = method.lower()
        m = self.i["method"].split()  # HTTP methods allowed by this operation

        if str_method in m:
            try:
                # Extract each URL parameter and cast it according to the type
                # declared in the specification ('str' values and undeclared
                # parameters are kept as raw strings).
                par_dict = {}
                par_man = match(self.op, self.op_url).groups()
                for idx, par in enumerate(findall("{([^{}]+)}", self.i["url"])):
                    try:
                        par_type = self.i[par].split("(")[0]
                        if par_type == "str":
                            par_value = par_man[idx]
                        else:
                            par_value = self.dt.get_func(par_type)(par_man[idx])
                    except KeyError:
                        # no shape declared for this parameter: keep the raw string
                        par_value = par_man[idx]
                    par_dict[par] = par_value

                if self.addon is not None:
                    self.preprocess(par_dict, self.i, self.addon)

                sparql_text = self.i["sparql"]

                if "@@" not in sparql_text:
                    # Fast path: single-query (legacy behavior)

                    if self.engine == "sparql-anything":
                        # SPARQL Anything backend: substitute the [[param]]
                        # placeholders, run the query through PySPARQL-Anything,
                        # then feed the rows into the standard pipeline.
                        query = sparql_text
                        for param in par_dict:
                            query = query.replace("[[%s]]" % param, str(par_dict[param]))
                        rows = self._run_sparql_anything_dicts(query)
                        header = self._header_from_field_type(self.i, rows or [])
                        csv_rows = self._to_csv_rows(header, rows or [])
                        res = self.type_fields(csv_rows, self.i)
                        if self.addon is not None:
                            res = self.postprocess(res, self.i, self.addon)
                        q_string = parse_qs(quote(self.url_parsed.query, safe="&="))
                        res = self.handling_params(q_string, res)
                        res = self.remove_types(res)
                        s_res = StringIO()
                        writer(s_res).writerows(res)
                        body, ctype = self.conv(s_res.getvalue(), q_string, content_type)
                        return 200, body, ctype

                    # Handle in case the parameters are lists, we need to generate all possible combinations
                    par_dict = {p_k: [par_dict[p_k]] if not isinstance(par_dict[p_k], list) else par_dict[p_k] for p_k in par_dict}
                    combinations = product(*par_dict.values())

                    parameters_comb = []
                    for combination in combinations:
                        parameters_comb.append(dict(zip(list(par_dict.keys()), list(combination))))

                    # the __parameters_comb__ variable is a list of dictionaries,
                    # each dictionary stores a possible combination of parameter values
                    #
                    # Example: {"id":"5","area":["A1","A2"]} -> [ {"id":"5","area":"A1"}, {"id":"5","area":"A2"} ]
                    # Example: {"id":"5","area":"A1"} -> [ {"id":"5","area":"A1"} ]

                    # iterate over __parameters_comb__

                    list_of_res = []
                    include_header_line = True
                    for par_dict in parameters_comb:

                        query = self.i["sparql"]
                        for param in par_dict:
                            query = query.replace("[[%s]]" % param, str(par_dict[param]))

                        # GET and POST are sync
                        # TODO: use threads to make it parallel

                        if self.sparql_http_method == "get":
                            r = _http_session.get(self.tp + "?query=" + quote(query),
                                                  headers={"Accept": "text/csv"}, timeout=DEFAULT_HTTP_TIMEOUT)
                        else:
                            r = _http_session.post(self.tp, data=query, headers={"Accept": "text/csv",
                                                   "Content-Type": "application/sparql-query"}, timeout=DEFAULT_HTTP_TIMEOUT)
                        r.encoding = "utf-8"

                        sc = r.status_code
                        if sc == 200:
                            # This line has been added to avoid a strange behaviour of the 'splitlines' method in
                            # presence of strange characters (non-UTF8).
                            list_of_lines = [line.decode("utf-8") for line in r.text.encode("utf-8").splitlines()]

                        else:
                            # any non-200 aborts the whole call immediately
                            return sc, "HTTP status code %s: %s" % (sc, r.reason), "text/plain"

                        # each res will have a list of list_of_line
                        # include the header of the first result only
                        if not include_header_line:
                            list_of_lines = list_of_lines[1:]
                        include_header_line = False

                        # list_of_res Example:
                        # [ ["id,val","01,a","02,b"] , ["id,val","05,u","08,p"] ]
                        list_of_res += list_of_lines

                    #
                    # ----- DELEGATE to POST PROCESSING operations
                    # return 200, "HTTP print for debug %s: %s" % (200, list_of_res), "text/plain"

                    res = self.type_fields(list(reader(list_of_res)), self.i)
                    if self.addon is not None:
                        res = self.postprocess(res, self.i, self.addon)
                    q_string = parse_qs(quote(self.url_parsed.query, safe="&="))
                    res = self.handling_params(q_string, res)
                    res = self.remove_types(res)
                    s_res = StringIO()
                    writer(s_res).writerows(res)
                    return (sc,) + self.conv(s_res.getvalue(), q_string, content_type)

                else:
                    # Multi-source path: @@ directives present
                    try:
                        steps = self._parse_steps(sparql_text, self.tp, par_dict)

                        acc = None  # accumulator: list of dict rows
                        pending_join = None  # set by a JOIN step, consumed by the next QUERY
                        pending_values_vars = None  # set by VALUES_INJECT, consumed by the next QUERY

                        foreach_sources = {}  # alias -> column name (without '?')
                        pending_foreach = None  # (alias, delay_seconds)

                        for st in steps:
                            tag = st[0]

                            if tag == "QUERY":
                                _, endpoint_url, qtxt = st
                                if not qtxt or not qtxt.strip():
                                    continue  # defensive: skip any empty query steps

                                # FOREACH mode: run one query per value
                                if pending_foreach is not None:
                                    alias, delay = pending_foreach

                                    if alias not in foreach_sources:
                                        raise ValueError(
                                            f"@@foreach refers to unknown alias '{alias}'. "
                                            f"Declare it with @@values ?var:{alias} before @@foreach."
                                        )

                                    source_col = foreach_sources[alias]  # e.g. "br"

                                    # Collect distinct non-empty values from the accumulator
                                    values = []
                                    seen = set()
                                    for row in (acc or []):
                                        v = row.get(source_col)
                                        if v and v not in seen:
                                            seen.add(v)
                                            values.append(v)

                                    all_rows = []
                                    for idx_val, val in enumerate(values):
                                        # Substitute [[alias]] in the query text
                                        q_one = qtxt.replace(f"[[{alias}]]", str(val))
                                        sub_rows = self._run_query_dicts(endpoint_url, q_one)
                                        if sub_rows:
                                            all_rows.extend(sub_rows)
                                        # Sleep between calls if requested (not after the last)
                                        if delay and idx_val + 1 < len(values):
                                            time.sleep(delay)

                                    rows = all_rows
                                    # FOREACH applies only to this single QUERY
                                    pending_foreach = None
                                    # In FOREACH mode we ignore any pending VALUES_INJECT
                                    pending_values_vars = None

                                else:
                                    # Normal multi-source behaviour
                                    if pending_values_vars:
                                        # acc is the current accumulator rows
                                        qtxt = self._inject_values_clause(qtxt, pending_values_vars, acc)
                                        pending_values_vars = None  # only affects this single query
                                    rows = self._run_query_dicts(endpoint_url, qtxt)

                                if acc is None:
                                    # first query defines the accumulator
                                    acc = rows
                                else:
                                    if pending_join:
                                        lvar, rvar, how = pending_join
                                        acc = self._join(acc, rows, lvar, rvar, how)
                                        pending_join = None
                                    else:
                                        raise ValueError(
                                            "Multiple QUERY steps without an explicit @@join directive"
                                        )

                            elif tag == "JOIN":
                                # default join type is "inner" when not specified
                                pending_join = (st[1], st[2], st[3] if len(st) > 3 and st[3] else "inner")

                            elif tag == "REMOVE":
                                _, vars_ = st
                                acc = self._drop_columns(acc or [], vars_)

                            elif tag == "VALUES_INJECT":
                                # st = ("VALUES_INJECT", ["?br", ...])
                                pending_values_vars = st[1]

                            elif tag == "FOREACH_SETUP":
                                # st = ("FOREACH_SETUP", alias, var_name)
                                _, alias, var_name = st
                                foreach_sources[alias] = var_name.lstrip("?")

                            elif tag == "FOREACH_MARK":
                                # st = ("FOREACH_MARK", alias, delay)
                                _, alias, delay = st
                                pending_foreach = (alias, delay)

                            else:
                                raise RuntimeError(f"Unknown step tag {tag}")

                        # Convert merged dict rows -> CSV rows; then run the usual pipeline
                        header = self._header_from_field_type(self.i, acc or [])
                        csv_rows = self._to_csv_rows(header, acc or [])

                        res = self.type_fields(csv_rows, self.i)
                        if self.addon is not None:
                            res = self.postprocess(res, self.i, self.addon)
                        q_string = parse_qs(quote(self.url_parsed.query, safe="&="))
                        res = self.handling_params(q_string, res)
                        res = self.remove_types(res)
                        s_res = StringIO()
                        writer(s_res).writerows(res)
                        body, ctype = self.conv(s_res.getvalue(), q_string, content_type)
                        return 200, body, ctype

                    except ValueError as ve:
                        # configuration/plan errors map to a client error
                        sc = 400
                        return sc, f"HTTP status code {sc}: {ve}", "text/plain"
                    except RuntimeError as re_err:
                        # backend/transport errors map to a bad-gateway error
                        sc = 502
                        return sc, f"HTTP status code {sc}: {re_err}", "text/plain"

            except TimeoutError:
                exc_type, exc_obj, exc_tb = exc_info()
                sc = 408
                return sc, "HTTP status code %s: request timeout - %s: %s (line %s)" % \
                       (sc, exc_type.__name__, exc_obj,
                        exc_tb.tb_lineno), "text/plain"
            except TypeError:
                exc_type, exc_obj, exc_tb = exc_info()
                sc = 400
                return sc, "HTTP status code %s: " \
                           "parameter in the request not compliant with the type specified - %s: %s (line %s)" % \
                       (sc, exc_type.__name__, exc_obj,
                        exc_tb.tb_lineno), "text/plain"
            except:
                # last-resort catch-all so the API always answers with a status code
                exc_type, exc_obj, exc_tb = exc_info()
                sc = 500
                return sc, "HTTP status code %s: something unexpected happened - %s: %s (line %s)" % \
                       (sc, exc_type.__name__, exc_obj,
                        exc_tb.tb_lineno), "text/plain"
        else:
            sc = 405
            return sc, "HTTP status code %s: '%s' method not allowed" % (sc, str_method), "text/plain"

2498 # END: Processing methods 

2499 

2500 

2501class APIManager(object): 

2502 # Fixing max size for CSV 

2503 @staticmethod 

2504 def __max_size_csv(): 

2505 from sys import maxsize 

2506 import csv 

2507 maxInt = maxsize 

2508 while True: 

2509 try: 

2510 csv.field_size_limit(maxInt) 

2511 break 

2512 except OverflowError: # pragma: no cover 

2513 maxInt = int(maxInt/10) 

2514 

2515 # Constructor: START 

    def __init__(self, conf_files, endpoint_override=None):
        """This is the constructor of the APIManager class. It takes in input a list of API configuration files, each
        defined according to the Hash Format and following a particular structure, and stores all the operations
        defined within a dictionary. Optionally, an endpoint_override parameter can be provided to override the
        SPARQL endpoint defined in the configuration files (useful for staging/production environments).
        The structure of each item in the dictionary of the operations is defined as follows:

        {
            "/api/v1/references/(.+)": {
                "sparql": "PREFIX ...",
                "method": "get",
                ...
            },
            ...
        }

        In particular, each key in the dictionary identifies the full URL of a particular API operation, and it is
        used so as to understand which operation should be called once an API call is done. The object associated
        as value of this key is the transformation of the related operation defined in the input Hash Format file
        into a dictionary.

        In addition, it also defines additional structure, such as the functions to be used for interpreting the
        values returned by a SPARQL query, some operations that can be used for filtering the results, and the
        HTTP methods to call for making the request to the SPARQL endpoint specified in the configuration file.

        :param conf_files: list of paths to Hash Format configuration files.
        :param endpoint_override: optional SPARQL endpoint URL replacing the one in
            each configuration file.
        """
        APIManager.__max_size_csv()

        # base_url -> full configuration dict for that API
        self.all_conf = OrderedDict()
        # flat list of every API's base URL, in configuration-file order
        self.base_url = []
        for conf_file in conf_files:
            conf = OrderedDict()
            tp = None
            conf_json = HashFormatHandler().read(conf_file)
            base_url = None
            addon = None
            sources_map = {}  # named endpoints usable via @@with
            allow_inline_endpoints = False
            engine = "sparql"  # default backend unless the config says otherwise
            for item in conf_json:
                if base_url is None:
                    # First item of the file: API-level settings.
                    base_url = item["url"]
                    self.base_url.append(item["url"])
                    website = item["base"]
                    tp = endpoint_override if endpoint_override else item["endpoint"]

                    # Engine selection at API level (optional)
                    if "engine" in item:
                        engine = item["engine"].strip().lower()

                    # Optional: named sources registry
                    if "sources" in item:
                        # expected: "name1=url1; name2=url2"
                        for pair in item["sources"].split(";"):
                            pair = pair.strip()
                            if not pair:
                                continue
                            name, url = pair.split("=", 1)
                            sources_map[name.strip()] = url.strip()

                    # Optional: allow explicit @@endpoint <url> in #sparql
                    if "allow_inline_endpoints" in item:
                        allow_inline_endpoints = str(item["allow_inline_endpoints"]).strip().lower() in ("true", "1", "yes", "y")

                    if "addon" in item:
                        # make the add-on module importable, then import it
                        addon_abspath = abspath(dirname(conf_file) + sep + item["addon"])
                        path.append(dirname(addon_abspath))
                        addon = import_module(basename(addon_abspath))
                    sparql_http_method = "post"  # default HTTP verb for SPARQL calls
                    if "method" in item:
                        sparql_http_method = item["method"].strip().lower()
                else:
                    # Every subsequent item describes one API operation, keyed by
                    # its normalised matching URL.
                    conf[APIManager.nor_api_url(item, base_url)] = item

            self.all_conf[base_url] = {
                "conf": conf,
                "tp": tp,
                "conf_json": conf_json,
                "base_url": base_url,
                "website": website,
                "addon": addon,
                "sparql_http_method": sparql_http_method,
                "sources_map": sources_map,
                "allow_inline_endpoints": allow_inline_endpoints,
                "engine": engine,
            }

2600 # Constructor: END 

2601 

2602 # START: Ancillary methods 

2603 @staticmethod 

2604 def nor_api_url(i, b=""): 

2605 """This method takes an API operation object and an optional base URL (e.g. "/api/v1") as input 

2606 and returns the URL composed by the base URL plus the API URL normalised according to specific rules. In 

2607 particular, these normalisation rules takes the operation URL (e.g. "#url /citations/{oci}") and the 

2608 specification of the shape of all the parameters between brackets in the URL (e.g. "#oci str([0-9]+-[0-9]+)"), 

2609 and returns a new operation URL where the parameters have been substituted with the regular expressions 

2610 defining them (e.g. "/citations/([0-9]+-[0-9]+)"). This URL will be used by RAMOSE for matching the 

2611 particular API calls with the specific operation to execute.""" 

2612 result = i["url"] 

2613 

2614 for term in findall(PARAM_NAME, result): 

2615 try: 

2616 t = i[term] 

2617 except KeyError: 

2618 t = "str(.+)" 

2619 result = result.replace("{%s}" % term, "%s" % sub(r"^[^\(]+(\(.+\))$", r"\1", t)) 

2620 

2621 return "%s%s" % (b, result) 

2622 

2623 def best_match(self, u): 

2624 """This method takes an URL of an API call in input and find the API operation URL and the related 

2625 configuration that best match with the API call, if any.""" 

2626 #u = u.decode('UTF8') if isinstance(u, (bytes, bytearray)) else u 

2627 cur_u = sub(r"\?.*$", "", u) 

2628 result = None, None 

2629 for base_url in self.all_conf: 

2630 if u.startswith(base_url): 

2631 conf = self.all_conf[base_url] 

2632 for pat in conf["conf"]: 

2633 if match("^%s$" % pat, cur_u): 

2634 result = conf, pat 

2635 break 

2636 return result 

2637 # END: Ancillary methods 

2638 

2639 # START: Processing methods 

2640 def get_op(self, op_complete_url): 

2641 """This method returns a new object of type Operation which represent the operation specified by 

2642 the input URL (parameter 'op_complete_url)'. In case no operation can be found according by checking 

2643 the configuration files available in the APIManager, a tuple with an HTTP error code and a message 

2644 is returned instead.""" 

2645 url_parsed = urlsplit(op_complete_url) 

2646 op_url = url_parsed.path 

2647 

2648 conf, op = self.best_match(op_url) 

2649 if op is not None: 

2650 op_conf = conf["conf"][op] 

2651 op_engine = conf.get("engine", "sparql") 

2652 if "engine" in op_conf: 

2653 op_engine = op_conf["engine"].strip().lower() 

2654 

2655 # Build op-level format map from the operation block 

2656 op_format_map = {} 

2657 if "format" in op_conf: 

2658 fm_val = op_conf["format"] 

2659 fm_list = fm_val if isinstance(fm_val, list) else [fm_val] 

2660 for fm in fm_list: 

2661 for part in fm.split(";"): 

2662 part = part.strip() 

2663 if not part: 

2664 continue 

2665 fmt, func = part.split(",", 1) 

2666 op_format_map[fmt.strip()] = func.strip() 

2667 

2668 return Operation( 

2669 op_complete_url, 

2670 op, 

2671 op_conf, 

2672 conf["tp"], 

2673 conf["sparql_http_method"], 

2674 conf["addon"], 

2675 op_format_map, 

2676 conf.get("sources_map", {}), 

2677 conf.get("allow_inline_endpoints", False), 

2678 op_engine, 

2679 ) 

2680 else: 

2681 sc = 404 

2682 return sc, "HTTP status code %s: the operation requested does not exist" % sc, "text/plain" 

2683 # END: Processing methods 

2684 

2685 

# Command-line entry point: parse the arguments, load the API specification
# file(s) into an APIManager, then either serve the API(s) through a Flask
# test webserver (-w), export documentation (-d / --openapi), or execute a
# single API call (-c) and print/save the response.
if __name__ == "__main__":
    arg_parser = ArgumentParser("ramose.py", description="The 'Restful API Manager Over SPARQL Endpoints' (a.k.a. "
                                                         "'RAMOSE') is an application that allows one to expose a "
                                                         "Restful API interface, according to a particular "
                                                         "specification document, to interact with a SPARQL endpoint.")

    arg_parser.add_argument("-s", "--spec", dest="spec", required=True, nargs='+',
                            help="The file(s) in hash format containing the specification of the API(s).")
    arg_parser.add_argument("-m", "--method", dest="method", default="get",
                            help="The method to use to make a request to the API.")
    arg_parser.add_argument("-c", "--call", dest="call",
                            help="The URL to call for querying the API.")
    arg_parser.add_argument("-f", "--format", dest="format", default="application/json",
                            help="The format in which to get the response.")
    arg_parser.add_argument("-d", "--doc", dest="doc", default=False, action="store_true",
                            help="Say to generate the HTML documentation of the API (if it is specified, all "
                                 "the arguments '-m', '-c', and '-f' won't be considered).")
    arg_parser.add_argument("--openapi", dest="openapi", default=False, action="store_true",
                            help="Export the API specification to OpenAPI 3.0 YAML.")
    arg_parser.add_argument("--api-base", dest="api_base", default=None,
                            help="When exporting docs/OpenAPI with multiple specs loaded, choose which API base URL to export.")
    arg_parser.add_argument("-o", "--output", dest="output",
                            help="A file where to store the response.")
    arg_parser.add_argument("-w", "--webserver", dest="webserver", default=False,
                            help="The host:port where to deploy a Flask webserver for testing the API.")
    arg_parser.add_argument("-css", "--css", dest="css",
                            help="The path of a .css file for styling the API documentation (to be specified either with '-w' or with '-d' and '-o' arguments).")

    args = arg_parser.parse_args()
    am = APIManager(args.spec)
    dh = HTMLDocumentationHandler(am)
    oah = OpenAPIDocumentationHandler(am)

    css_path = args.css if args.css else None

    if args.webserver:
        try:
            # NOTE(review): 'logging' is already imported at module level and
            # 'send_from_directory' appears unused in this block.
            import logging
            from flask import Flask, request, make_response, send_from_directory
            from werkzeug.exceptions import HTTPException

            # logs
            dh.logger_ramose()

            # web server: split "-w host:port"; default to 127.0.0.1:8080 when
            # no colon is present. NOTE(review): a bare IPv6 literal would be
            # split at its last colon — confirm only host:port is expected.
            host_name = args.webserver.rsplit(':', 1)[0] if ':' in args.webserver else '127.0.0.1'
            port = args.webserver.rsplit(':', 1)[1] if ':' in args.webserver else '8080'

            app = Flask(__name__)

            # This is due to Flask routing rules that do not accept URLs without the starting slash
            # but ramose calls start with the slash, hence we remove it if the flag args.webserver is added
            if args.call:
                args.call = args.call[1:]

            # routing
            @app.route('/')
            def home():
                # Landing page: index of all loaded APIs.
                index = dh.get_index(css_path)
                return index

            @app.route('/<path:api_url>')
            def doc(api_url):
                # Catch-all route: serves the OpenAPI export, the HTML
                # documentation of an API base URL, or executes an API call.
                # Default response is the index page with a 404 status.
                res, status = dh.get_index(css_path), 404
                # --- OpenAPI export endpoint ---
                # Example: /api/v1/openapi.yaml (or .yml)
                if api_url.endswith("openapi.yaml") or api_url.endswith("openapi.yml"):
                    base = api_url.rsplit("/", 1)[0]  # e.g. "api/v1"
                    if "/" + base in am.all_conf:
                        status, yml = oah.get_documentation(base_url=base)
                        response = make_response(yml, status)
                        response.headers.set("Content-Type", "application/yaml")
                        response.headers.set("Access-Control-Allow-Origin", "*")
                        response.headers.set("Access-Control-Allow-Credentials", "true")
                        return response
                    else:
                        return res, status
                # --- end OpenAPI export endpoint ---
                # Requests under a known API base URL are either documentation
                # pages (exact base match) or actual API calls (longer paths).
                if any(api_u in '/'+api_url for api_u, api_dict in am.all_conf.items()):
                    # documentation
                    if any(api_u == '/'+api_url for api_u,api_dict in am.all_conf.items()):
                        status, res = dh.get_documentation(css_path, api_url)
                        return res, status
                    # api calls
                    else:
                        cur_call = '/'+api_url
                        # NOTE(review): 'format' shadows the builtin of the same name.
                        format = request.args.get('format')
                        content_type = "text/csv" if format is not None and "csv" in format else "application/json"

                        # Rebuild the full call (path + decoded query string) and resolve it.
                        op = am.get_op(cur_call+'?'+unquote(request.query_string.decode('utf8')))
                        if type(op) is Operation:  # Operation found
                            status, res, c_type = op.exec(content_type=content_type)
                        else:  # HTTP error
                            status, res, c_type = op

                        if status == 200:
                            response = make_response(res, status)
                            response.headers.set('Content-Type', c_type)
                        else:
                            # The API Manager returns a text/plain message when there is an error.
                            # Now set to return the header requested by the user
                            if content_type == "text/csv":
                                si = StringIO()
                                cw = writer(si)
                                cw.writerows([["error","message"], [str(status),str(res)]])
                                response = make_response(si.getvalue(), status)
                                response.headers.set("Content-Disposition", "attachment", filename="error.csv")
                            else:
                                m_res = {"error": status, "message": res}
                                mes = dumps(m_res)
                                response = make_response(mes, status)
                            response.headers.set('Content-Type', content_type)  # overwrite text/plain

                        # allow CORS anyway
                        response.headers.set('Access-Control-Allow-Origin', '*')
                        response.headers.set('Access-Control-Allow-Credentials', 'true')

                        return response
                else:
                    return res, status

            # NOTE(review): debug=True enables the Werkzeug debugger/reloader;
            # this server is for testing only (as the '-w' help text states) —
            # do not expose it in production.
            app.run(host=str(host_name), debug=True, port=str(port))

        except Exception as e:
            # NOTE(review): this reduces any startup/runtime error to a one-line
            # message; consider logging the full traceback instead.
            exc_type, exc_obj, exc_tb = exc_info()
            fname = pt.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print("[ERROR]", exc_type, fname, exc_tb.tb_lineno)

    else:
        # run locally via shell
        if args.openapi:
            # get_documentation returns (status, body); append the content type.
            res = oah.get_documentation(base_url=args.api_base) + ("application/yaml", )
        elif args.doc:
            res = dh.get_documentation(css_path) + ("text/html", )
        else:
            op = am.get_op(args.call)
            if type(op) is Operation:  # Operation found
                res = op.exec(args.method, args.format)
            else:  # HTTP error
                res = op

        # 'res' is a (status, body, content type) triple in every branch above.
        if args.output is None:
            print("# Response HTTP code: %s\n# Body:\n%s\n# Content-type: %s" % res)
        else:
            with open(args.output, "w") as f:
                f.write(res[1])