Coverage for mlos_viz/mlos_viz/base.py: 90% (156 statements)


#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""Base functions for visualizing, explaining, and gaining insights from results."""

import re
import warnings
from collections.abc import Callable, Iterable
from importlib.metadata import version
from typing import Any, Literal

import pandas
import seaborn as sns
from matplotlib import pyplot as plt
from pandas.api.types import is_numeric_dtype
from pandas.core.groupby.generic import SeriesGroupBy

from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.util import expand_results_data_args

_SEABORN_VERS = version("seaborn")


def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> dict[str, Any]:
    """
    Assembles a smaller kwargs dict for the specified target function.

    Note: this only works with non-positional kwargs (e.g., those after a * arg).
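
    Example (an illustrative sketch; ``example_func`` is a hypothetical function,
    not part of this module):

    >>> def example_func(*, alpha=1, beta=2):
    ...     pass
    >>> _get_kwarg_defaults(example_func, alpha=10, gamma=3)
    {'alpha': 10}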

    """
    target_kwargs = {}
    for kword in target.__kwdefaults__:  # or {} # intentionally omitted for now
        if kword in kwargs:
            target_kwargs[kword] = kwargs[kword]
    return target_kwargs


def ignore_plotter_warnings() -> None:
    """Suppress some annoying warnings from third-party data visualization packages by
    adding them to the warnings filter.
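
    Example (a minimal usage sketch; typically called once, e.g., at the top of a
    notebook, before any plotting):

    >>> ignore_plotter_warnings()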

    """
    warnings.filterwarnings("ignore", category=FutureWarning)
    if _SEABORN_VERS <= "0.13.1":
        warnings.filterwarnings(
            "ignore",
            category=DeprecationWarning,
            module="seaborn",  # but actually comes from pandas
            message="is_categorical_dtype is deprecated and will be removed in a future version.",
        )
    # See Also: https://github.com/mwaskom/seaborn/issues/3804
    warnings.filterwarnings(
        "ignore",
        category=PendingDeprecationWarning,
        module="seaborn",  # but actually comes from matplotlib
        message=(
            "vert: bool will be deprecated in a future version. "
            "Use orientation: {'vertical', 'horizontal'} instead."
        ),
    )


def _add_groupby_desc_column(
    results_df: pandas.DataFrame,
    groupby_columns: list[str] | None = None,
) -> tuple[pandas.DataFrame, list[str], str]:
    """
    Adds a group descriptor column to the results_df.

    Parameters
    ----------
    results_df : pandas.DataFrame
        The results dataframe to add the descriptor column to.
    groupby_columns : list[str] | None
        The columns to concatenate into the descriptor, by default
        ["tunable_config_trial_group_id", "tunable_config_id"].
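
    Example (an illustrative sketch with a small hypothetical results_df):

    >>> df = pandas.DataFrame(
    ...     {"tunable_config_trial_group_id": [1, 1], "tunable_config_id": [7, 7]}
    ... )
    >>> (df, groupby_columns, groupby_column) = _add_groupby_desc_column(df)
    >>> df[groupby_column].tolist()
    ['1,7', '1,7']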

    """
    # Compose a new groupby_column for display purposes that is the
    # concatenation of the min trial_id (the first one) of each config trial
    # group and the config_id.
    # Note: it needs to be a string (e.g., categorical) for boxplot and lineplot
    # to be on the same axis anyway.
    if groupby_columns is None:
        groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"]
    groupby_column = ",".join(groupby_columns)
    results_df[groupby_column] = (
        results_df[groupby_columns].astype(str).apply(",".join, axis=1)
    )
    groupby_columns.append(groupby_column)
    return (results_df, groupby_columns, groupby_column)


def augment_results_df_with_config_trial_group_stats(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    requested_result_cols: Iterable[str] | None = None,
) -> pandas.DataFrame:
    # pylint: disable=too-complex
    """
    Add a number of useful statistical measure columns to the results dataframe.

    In particular, for each requested numeric result column, we add the following
    columns:

    - ".p50": the median of each config trial group's results

    - ".p75": the p75 of each config trial group's results

    - ".p90": the p90 of each config trial group's results

    - ".p95": the p95 of each config trial group's results

    - ".p99": the p99 of each config trial group's results

    - ".mean": the mean of each config trial group's results

    - ".stddev": the standard deviation of each config trial group's results

    - ".var": the variance of each config trial group's results

    - ".var_zscore": the zscore of this group's variance (i.e., its variance
      relative to the stddev of all group variances). This can be useful for
      filtering out outliers (e.g., configs with high variance relative to
      others) by restricting to abs < 2, which removes groups more than two
      standard deviations from the mean variance across all config trial groups.

    Additionally, we add a "tunable_config_trial_group_size" column that indicates
    the number of trials using a particular config.

    Parameters
    ----------
    exp_data : ExperimentData | None
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use the results_df property.
    requested_result_cols : Iterable[str] | None
        Which results columns to augment, by default None to use all results columns
        that look numeric.

    Returns
    -------
    pandas.DataFrame
        The augmented results dataframe.
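
    Example (an illustrative sketch with a small hypothetical results_df; assumes
    :py:attr:`.ExperimentData.RESULT_COLUMN_PREFIX` is ``"result."``):

    >>> df = pandas.DataFrame(
    ...     {
    ...         "tunable_config_id": [1, 1, 2, 2],
    ...         "trial_id": [1, 2, 3, 4],
    ...         "result.score": [1.0, 2.0, 3.0, 4.0],
    ...     }
    ... )
    >>> augmented = augment_results_df_with_config_trial_group_stats(results_df=df)
    >>> augmented["result.score.mean"].tolist()
    [1.5, 1.5, 3.5, 3.5]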

    """
    if results_df is None:
        if exp_data is None:
            raise ValueError("Either exp_data or results_df must be provided.")
        results_df = exp_data.results_df
    results_groups = results_df.groupby("tunable_config_id")
    if len(results_groups) <= 1:
        raise ValueError(f"Not enough data: {len(results_groups)}")

    if requested_result_cols is None:
        result_cols = {
            col
            for col in results_df.columns
            if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX)
        }
    else:
        result_cols = {
            col
            for col in requested_result_cols
            if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and col in results_df.columns
        }
        result_cols.update(
            {
                ExperimentData.RESULT_COLUMN_PREFIX + col
                for col in requested_result_cols
                if ExperimentData.RESULT_COLUMN_PREFIX + col in results_df.columns
            }
        )

    def compute_zscore_for_group_agg(
        results_groups_perf: "SeriesGroupBy",
        stats_df: pandas.DataFrame,
        result_col: str,
        agg: Literal["mean"] | Literal["var"] | Literal["std"],
    ) -> None:
        results_groups_perf_aggs = results_groups_perf.agg(agg)  # TODO: avoid recalculating?
        # Compute the zscore of the chosen aggregate performance of each group into
        # each row in the dataframe.
        stats_df[result_col + f".{agg}_mean"] = results_groups_perf_aggs.mean()
        stats_df[result_col + f".{agg}_stddev"] = results_groups_perf_aggs.std()
        stats_df[result_col + f".{agg}_zscore"] = (
            stats_df[result_col + f".{agg}"] - stats_df[result_col + f".{agg}_mean"]
        ) / stats_df[result_col + f".{agg}_stddev"]
        stats_df.drop(
            columns=[result_col + f".{agg}_{stat}" for stat in ("mean", "stddev")],
            inplace=True,
        )

    augmented_results_df = results_df
    augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform(
        "count"
    )
    for result_col in result_cols:
        if not result_col.startswith(ExperimentData.RESULT_COLUMN_PREFIX):
            continue
        if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
            # Ignore computing variance on things that look like timestamps.
            continue
        if not is_numeric_dtype(results_df[result_col]):
            continue
        if results_df[result_col].unique().size == 1:
            continue
        results_groups_perf = results_groups[result_col]
        stats_df = pandas.DataFrame()
        stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
        stats_df[result_col + ".var"] = results_groups_perf.transform("var")
        stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)

        compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
        quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
        for quantile in quantiles:  # TODO: can we do this in one pass?
            quantile_col = f"{result_col}.p{int(quantile * 100)}"
            stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
        augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
    return augmented_results_df


def limit_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    top_n_configs: int = 10,
    method: Literal["mean", "p50", "p75", "p90", "p95", "p99"] = "mean",
) -> tuple[pandas.DataFrame, list[int], dict[str, bool]]:
    # pylint: disable=too-many-locals
    """
    Utility function to process the results and determine the best performing
    configs, including potential repeats, to help assess variability.

    Parameters
    ----------
    exp_data : ExperimentData | None
        The ExperimentData (e.g., obtained from the storage layer) to operate on.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use
        :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Which result column(s) to use for sorting the configs, and in which
        direction ("min" or "max").
        By default None to automatically select the :py:attr:`.ExperimentData.objectives`.
    top_n_configs : int
        How many configs to return, including the default, by default 10.
    method : Literal["mean", "p50", "p75", "p90", "p95", "p99"]
        Which statistical method to use when sorting the config groups before
        determining the cutoff, by default "mean".

    Returns
    -------
    (top_n_config_results_df, top_n_config_ids, orderby_cols) :
        tuple[pandas.DataFrame, list[int], dict[str, bool]]
        The filtered results dataframe, the config ids, and the columns used to
        order the configs.
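
    Example (an illustrative usage sketch; assumes ``exp_data`` was previously
    loaded from the storage layer):

    >>> (top_n_df, top_n_config_ids, orderby_cols) = limit_top_n_configs(  # doctest: +SKIP
    ...     exp_data,
    ...     top_n_configs=5,
    ...     method="p90",
    ... )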

    """
    # Do some input checking first.
    if method not in ["mean", "p50", "p75", "p90", "p95", "p99"]:
        raise ValueError(f"Invalid method: {method}")

    # Prepare the orderby columns.
    (results_df, objs_cols) = expand_results_data_args(
        exp_data,
        results_df=results_df,
        objectives=objectives,
    )
    assert isinstance(results_df, pandas.DataFrame)

    # Augment the results dataframe with some useful stats.
    results_df = augment_results_df_with_config_trial_group_stats(
        exp_data=exp_data,
        results_df=results_df,
        requested_result_cols=objs_cols.keys(),
    )
    # Note: mypy seems to lose its mind for some reason and keeps forgetting that
    # results_df is not None and is in fact a DataFrame, so we periodically assert
    # it in this func for now.
    assert results_df is not None
    orderby_cols: dict[str, bool] = {
        obj_col + f".{method}": ascending for (obj_col, ascending) in objs_cols.items()
    }

    config_id_col = "tunable_config_id"
    group_id_col = "tunable_config_trial_group_id"  # first trial_id per config group
    trial_id_col = "trial_id"

    default_config_id = (
        results_df[trial_id_col].min() if exp_data is None else exp_data.default_tunable_config_id
    )
    assert default_config_id is not None, "Failed to determine default config id."

    # Filter out configs whose variance is too large.
    # But also make sure the default config is still in the resulting dataframe
    # (for comparison purposes).
    for obj_col in objs_cols:
        assert results_df is not None
        if method == "mean":
            singletons_mask = results_df["tunable_config_trial_group_size"] == 1
        else:
            singletons_mask = results_df["tunable_config_trial_group_size"] > 1
        results_df = results_df.loc[
            (
                (results_df[f"{obj_col}.var_zscore"].abs() < 2)
                | (singletons_mask)
                | (results_df[config_id_col] == default_config_id)
            )
        ]
    assert results_df is not None

    # Also, filter out results that are worse than the default.
    default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
    for orderby_col, ascending in orderby_cols.items():
        default_vals = default_config_results_df[orderby_col].unique()
        assert len(default_vals) == 1
        default_val = default_vals[0]
        assert results_df is not None
        if ascending:
            results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
        else:
            results_df = results_df.loc[(results_df[orderby_col] >= default_val)]

    # Now regroup and filter to the top-N configs by their group performance dimensions.
    assert results_df is not None
    group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[
        orderby_cols.keys()
    ]
    top_n_config_ids: list[int] = (
        group_results_df.sort_values(
            by=list(orderby_cols.keys()), ascending=list(orderby_cols.values())
        )
        .head(top_n_configs)
        .index.tolist()
    )

    # Remove the default config if it's included. We'll add it back later.
    if default_config_id in top_n_config_ids:
        top_n_config_ids.remove(default_config_id)
    # Get just the top-n config results.
    # Sort by the group ids.
    top_n_config_results_df = results_df.loc[
        (results_df[config_id_col].isin(top_n_config_ids))
    ].sort_values([group_id_col, config_id_col, trial_id_col])
    # Place the default config at the top of the list.
    top_n_config_ids.insert(0, default_config_id)
    top_n_config_results_df = pandas.concat(
        [default_config_results_df, top_n_config_results_df],
        axis=0,
    )
    return (top_n_config_results_df, top_n_config_ids, orderby_cols)


def plot_optimizer_trends(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
) -> None:
    """
    Plots the optimizer trends for the Experiment.

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        Optional results_df to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Optional objectives to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
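
    Example (an illustrative usage sketch in a notebook; assumes ``exp_data`` was
    previously loaded from the storage layer):

    >>> plot_optimizer_trends(exp_data)  # doctest: +SKIP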

    """
    (results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    (results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)

    for objective_column, ascending in obj_cols.items():
        incumbent_column = objective_column + ".incumbent"

        # Determine the mean of each config trial group to match the box plots.
        group_results_df = (
            results_df.groupby(groupby_columns)[objective_column]
            .mean()
            .reset_index()
            .sort_values(groupby_columns)
        )
        #
        # Note: technically the optimizer (usually) uses the *first* result for a
        # given config trial group before moving on to a new config (x-axis), so
        # plotting the mean may be slightly misleading when trying to understand the
        # actual path taken by the optimizer in case of high variance samples.
        # Here's a way to do that, though it can also be misleading if the optimizer
        # later gets a worse value for that config group as well.
        #
        # group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
        #     groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()

        # Calculate the incumbent (best seen so far).
        if ascending:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
        else:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummax()

        (_fig, axis) = plt.subplots(figsize=(15, 5))

        # Result of each set of trials for a config.
        sns.boxplot(
            data=results_df,
            x=groupby_column,
            y=objective_column,
            ax=axis,
        )

        # Results of the best seen so far.
        axis = sns.lineplot(
            data=group_results_df,
            x=groupby_column,
            y=incumbent_column,
            alpha=0.7,
            label="Mean of Incumbent Config Trial Group",
            ax=axis,
        )

        plt.yscale("log")
        plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))

        plt.xlabel("Config Trial Group ID, Config ID")
        plt.xticks(rotation=90, fontsize=8)

        plt.title(
            ("Optimizer Trends for Experiment: " + exp_data.experiment_id)
            if exp_data is not None
            else ""
        )
        plt.grid()
        plt.show()


def plot_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    with_scatter_plot: bool = False,
    **kwargs: Any,
) -> None:
    # pylint: disable=too-many-locals
    """
    Plots the top-N configs along with the default config for the given
    :py:class:`.ExperimentData`.

    Intended to be used from a Jupyter notebook.

    Parameters
    ----------
    exp_data : ExperimentData
        The experiment data to plot.
    results_df : pandas.DataFrame | None
        Optional results_df to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Optional objectives to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
    with_scatter_plot : bool
        Whether to also add a scatter plot to the output figure.
    kwargs : dict
        Remaining keyword arguments are passed along to the
        :py:func:`limit_top_n_configs` function.
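
    Example (an illustrative usage sketch in a notebook; assumes ``exp_data`` was
    previously loaded from the storage layer, with ``top_n_configs`` forwarded to
    :py:func:`limit_top_n_configs`):

    >>> plot_top_n_configs(  # doctest: +SKIP
    ...     exp_data,
    ...     with_scatter_plot=True,
    ...     top_n_configs=5,
    ... )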

    """
    (results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
    if "results_df" not in top_n_config_args:
        top_n_config_args["results_df"] = results_df
    if "objectives" not in top_n_config_args:
        top_n_config_args["objectives"] = objectives
    (top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(
        exp_data=exp_data,
        **top_n_config_args,
    )

    (top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(
        top_n_config_results_df,
    )
    top_n = len(top_n_config_results_df[groupby_column].unique()) - 1

    for orderby_col, ascending in orderby_cols.items():
        opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
        (_fig, axis) = plt.subplots()
        sns.violinplot(
            data=top_n_config_results_df,
            x=groupby_column,
            y=orderby_col,
            ax=axis,
        )
        if with_scatter_plot:
            sns.scatterplot(
                data=top_n_config_results_df,
                x=groupby_column,
                y=orderby_col,
                legend=False,
                ax=axis,
            )
        plt.grid()
        (xticks, xlabels) = plt.xticks()
        # The default config should be in the first position based on the
        # limit_top_n_configs() return ordering.
        xlabels[0] = "default"  # type: ignore[call-overload]
        plt.xticks(xticks, xlabels)  # type: ignore[arg-type]
        plt.xlabel("Config Trial Group, Config ID")
        plt.xticks(rotation=90)
        plt.ylabel(opt_tgt)
        plt.yscale("log")
        extra_title = "(lower is better)" if ascending else "(higher is better)"
        plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
        plt.show()