Coverage for oc_botwatch / visualize.py: 98%

107 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-19 16:49 +0000

1# SPDX-FileCopyrightText: 2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import logging 

6from pathlib import Path 

7 

8import matplotlib.dates as mdates 

9import matplotlib.pyplot as plt 

10import matplotlib.ticker as mticker 

11import polars as pl 

12from matplotlib.axes import Axes 

13 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory two levels above this file, and the "output" folder inside it
# that main() uses as its default working directory.
BASE_DIR = Path(__file__).resolve().parents[1]
OUTPUT_DIR = BASE_DIR.joinpath("output")

# Services that get one subplot each in the per-service figures.
_SERVICES: tuple[str, ...] = ("web", "api", "sparql")
# Legend labels and the matching data column names, in stacking order.
_CATEGORY_LABELS: tuple[str, ...] = ("Human", "Generic bot", "LLM bot")
_CATEGORY_COLUMNS: tuple[str, ...] = ("human", "generic_bot", "llm_bot")

22 

23 

def _prepare(df: pl.DataFrame) -> pl.DataFrame:
    """Parse the ``date`` column and return the rows sorted by date.

    The ``date`` strings are parsed with the ``%Y-%m-%d`` format in non-strict
    mode. Rows whose ``date`` is null after parsing are dropped before sorting.
    """
    parsed_date = pl.col("date").str.to_date("%Y-%m-%d", strict=False)
    with_dates = df.with_columns(parsed_date)
    dated_rows = with_dates.drop_nulls("date")
    return dated_rows.sort("date")

26 

27 

def _fmt_axis(x: float, _pos: int) -> str:
    """Format a tick value compactly for a request-count axis.

    Args:
        x: Tick value to format.
        _pos: Tick position; unused, present because the function is passed
            to ``matplotlib.ticker.FuncFormatter``.

    Returns:
        ``"<n>M"`` for magnitudes of at least one million, ``"<n>K"`` for
        magnitudes of at least one thousand, and the plain rounded number
        otherwise, so a tick at 0 reads ``"0"`` rather than ``"0K"``.
    """
    # Choose the unit by magnitude so negative ticks get the same suffix as
    # their positive counterparts.
    magnitude = abs(x)
    if magnitude >= 1e6:  # noqa: PLR2004
        return f"{x / 1e6:.0f}M"
    if magnitude >= 1e3:  # noqa: PLR2004
        return f"{x / 1e3:.0f}K"
    # Below one thousand the "K" suffix would collapse every value to "0K"/"1K".
    return f"{x:.0f}"

30 

31 

def _setup_xaxis(ax: Axes) -> None:
    """Give the x-axis a monthly major locator and ``"%b %Y"`` tick labels."""
    month_locator = mdates.MonthLocator()
    month_label = mdates.DateFormatter("%b %Y")
    x_axis = ax.xaxis
    x_axis.set_major_locator(month_locator)
    x_axis.set_major_formatter(month_label)

35 

36 

def _pivot_by_service(long_df: pl.DataFrame, service: str) -> pl.DataFrame:
    """Build the wide per-date table of category counts for one service.

    Rows of ``long_df`` whose ``service`` equals ``service`` are pivoted so
    that each ``category`` becomes a column of ``count`` values indexed by
    ``date``. Missing cells are filled with 0, and every column named in
    ``_CATEGORY_COLUMNS`` is guaranteed to exist. The result holds ``date``
    followed by the category columns, passed through ``_prepare``.
    """
    service_rows = long_df.filter(pl.col("service") == service)
    wide = service_rows.pivot(on="category", index="date", values="count").fill_null(0)

    # A category with no rows for this service produces no pivot column;
    # add it as an all-zero column so the select below always succeeds.
    absent = [name for name in _CATEGORY_COLUMNS if name not in wide.columns]
    if absent:
        wide = wide.with_columns([pl.lit(0).alias(name) for name in absent])

    return _prepare(wide.select("date", *_CATEGORY_COLUMNS))

45 

46 

def plot_daily_traffic(df: pl.DataFrame, output: Path) -> None:
    """Save a stacked-area chart of daily request counts per category.

    Args:
        df: Frame with a ``date`` column plus ``human``, ``generic_bot`` and
            ``llm_bot`` columns; it is normalised with ``_prepare`` first.
        output: Path the figure is written to (150 dpi).
    """
    daily = _prepare(df)
    x_dates = daily["date"].to_list()
    # One layer per category, in the same order as the legend labels.
    layers = [daily[column].to_list() for column in _CATEGORY_COLUMNS]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.stackplot(x_dates, *layers, labels=list(_CATEGORY_LABELS), alpha=0.8)
    ax.legend(loc="upper left")
    ax.set_ylabel("Requests")
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(_fmt_axis))
    _setup_xaxis(ax)

    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig(output, dpi=150)
    plt.close(fig)
    logger.info("Output: %s", output)

65 

66 

def plot_daily_traffic_pct(df: pl.DataFrame, output: Path) -> None:
    """Save a stacked-area chart of each category's share of daily requests.

    Args:
        df: Frame with a ``date`` column plus ``human``, ``generic_bot`` and
            ``llm_bot`` columns; it is normalised with ``_prepare`` first.
        output: Path the figure is written to (150 dpi).
    """
    df = _prepare(df)
    total = df["human"] + df["generic_bot"] + df["llm_bot"]
    # A day with no requests at all gives 0 / 0 = NaN for every share, and a
    # NaN passed to stackplot corrupts the stacked layers at that date. Map
    # those shares to 0, as plot_daily_traffic_by_service_pct already does.
    layers = [(df[column] / total * 100).fill_nan(0).to_list() for column in _CATEGORY_COLUMNS]
    dates = df["date"].to_list()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.stackplot(dates, *layers, labels=list(_CATEGORY_LABELS), alpha=0.8)
    ax.legend(loc="upper left")
    ax.set_ylabel("Share of daily requests (%)")
    ax.set_ylim(0, 100)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    _setup_xaxis(ax)
    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig(output, dpi=150)
    plt.close(fig)
    logger.info("Output: %s", output)

87 

88 

def plot_daily_traffic_by_service(long_df: pl.DataFrame, output: Path) -> None:
    """Save one stacked-area chart of daily request counts per service.

    The figure has one subplot per entry of ``_SERVICES``, stacked vertically
    and sharing the x-axis.

    Args:
        long_df: Long-format frame handed to ``_pivot_by_service`` for each
            service.
        output: Path the figure is written to (150 dpi).
    """
    n_rows = len(_SERVICES)
    fig, axes = plt.subplots(n_rows, 1, figsize=(12, 12), sharex=True)

    for panel, service_name in zip(axes, _SERVICES, strict=True):
        table = _pivot_by_service(long_df, service_name)
        x_dates = table["date"].to_list()
        stacked = [table[column].to_list() for column in _CATEGORY_COLUMNS]

        panel.stackplot(x_dates, *stacked, labels=list(_CATEGORY_LABELS), alpha=0.8)
        panel.legend(loc="upper left")
        panel.set_title(service_name)
        panel.set_ylabel("Requests")
        panel.yaxis.set_major_formatter(mticker.FuncFormatter(_fmt_axis))

    # Only the bottom subplot is configured explicitly; the x-axis is shared.
    bottom_panel = axes[-1]
    _setup_xaxis(bottom_panel)

    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig(output, dpi=150)
    plt.close(fig)
    logger.info("Output: %s", output)

106 

107 

def plot_daily_traffic_by_service_pct(long_df: pl.DataFrame, output: Path) -> None:
    """Save one stacked-area chart of category shares per service.

    The figure has one subplot per entry of ``_SERVICES``, stacked vertically
    and sharing the x-axis; each shows the percentage of that service's daily
    requests attributed to each category.

    Args:
        long_df: Long-format frame handed to ``_pivot_by_service`` for each
            service.
        output: Path the figure is written to (150 dpi).
    """
    n_rows = len(_SERVICES)
    fig, axes = plt.subplots(n_rows, 1, figsize=(12, 12), sharex=True)

    for panel, service_name in zip(axes, _SERVICES, strict=True):
        table = _pivot_by_service(long_df, service_name)
        day_total = table["human"] + table["generic_bot"] + table["llm_bot"]

        shares = []
        for column in _CATEGORY_COLUMNS:
            percent = table[column] / day_total * 100
            # NaN shares (0 / 0 on days without requests) are plotted as 0.
            shares.append(percent.fill_nan(0).to_list())
        x_dates = table["date"].to_list()

        panel.stackplot(x_dates, *shares, labels=list(_CATEGORY_LABELS), alpha=0.8)
        panel.legend(loc="upper left")
        panel.set_title(service_name)
        panel.set_ylabel("Share (%)")
        panel.set_ylim(0, 100)
        panel.yaxis.set_major_formatter(mticker.PercentFormatter())

    # Only the bottom subplot is configured explicitly; the x-axis is shared.
    bottom_panel = axes[-1]
    _setup_xaxis(bottom_panel)

    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig(output, dpi=150)
    plt.close(fig)
    logger.info("Output: %s", output)

127 

128 

def main(output_dir: Path = OUTPUT_DIR) -> None:
    """Read the traffic CSVs in ``output_dir`` and write the four PNG charts there.

    ``daily_traffic.csv`` feeds the two overall charts and
    ``daily_traffic_by_service.csv`` feeds the two per-service charts.
    Logging is configured at INFO level first.
    """
    logging.basicConfig(level=logging.INFO)

    daily = pl.read_csv(output_dir / "daily_traffic.csv")
    overall_charts = (
        (plot_daily_traffic, "daily_traffic.png"),
        (plot_daily_traffic_pct, "daily_traffic_pct.png"),
    )
    for render, png_name in overall_charts:
        render(daily, output_dir / png_name)

    # The per-service CSV is read only after the overall charts are written.
    per_service = pl.read_csv(output_dir / "daily_traffic_by_service.csv")
    service_charts = (
        (plot_daily_traffic_by_service, "daily_traffic_by_service.png"),
        (plot_daily_traffic_by_service_pct, "daily_traffic_by_service_pct.png"),
    )
    for render, png_name in service_charts:
        render(per_service, output_dir / png_name)


if __name__ == "__main__":
    main()  # pragma: no cover