Coverage for mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py: 87%

103 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-10-07 01:52 +0000

1# 

2# Copyright (c) Microsoft Corporation. 

3# Licensed under the MIT License. 

4# 

5""" 

6Contains the wrapper class for SMAC Bayesian optimizers. 

7 

8See Also: <https://automl.github.io/SMAC3/main/index.html> 

9""" 

10 

11from logging import warning 

12from pathlib import Path 

13from tempfile import TemporaryDirectory 

14from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union 

15from warnings import warn 

16 

17import ConfigSpace 

18import numpy.typing as npt 

19import pandas as pd 

20 

21from mlos_core.optimizers.bayesian_optimizers.bayesian_optimizer import ( 

22 BaseBayesianOptimizer, 

23) 

24from mlos_core.spaces.adapters.adapter import BaseSpaceAdapter 

25from mlos_core.spaces.adapters.identity_adapter import IdentityAdapter 

26from mlos_core.util import drop_nulls 

27 

28 

29class SmacOptimizer(BaseBayesianOptimizer): 

30 """Wrapper class for SMAC based Bayesian optimization.""" 

31 

32 def __init__( 

33 self, 

34 *, # pylint: disable=too-many-locals,too-many-arguments 

35 parameter_space: ConfigSpace.ConfigurationSpace, 

36 optimization_targets: List[str], 

37 objective_weights: Optional[List[float]] = None, 

38 space_adapter: Optional[BaseSpaceAdapter] = None, 

39 seed: Optional[int] = 0, 

40 run_name: Optional[str] = None, 

41 output_directory: Optional[str] = None, 

42 max_trials: int = 100, 

43 n_random_init: Optional[int] = None, 

44 max_ratio: Optional[float] = None, 

45 use_default_config: bool = False, 

46 n_random_probability: float = 0.1, 

47 ): 

48 """ 

49 Instantiate a new SMAC optimizer wrapper. 

50 

51 Parameters 

52 ---------- 

53 parameter_space : ConfigSpace.ConfigurationSpace 

54 The parameter space to optimize. 

55 

56 optimization_targets : List[str] 

57 The names of the optimization targets to minimize. 

58 

59 objective_weights : Optional[List[float]] 

60 Optional list of weights of optimization targets. 

61 

62 space_adapter : BaseSpaceAdapter 

63 The space adapter class to employ for parameter space transformations. 

64 

65 seed : Optional[int] 

66 By default SMAC uses a known seed (0) to keep results reproducible. 

67 However, if a `None` seed is explicitly provided, we let a random seed 

68 be produced by SMAC. 

69 

70 run_name : Optional[str] 

71 Name of this run. This is used to easily distinguish across different runs. 

72 If set to `None` (default), SMAC will generate a hash from metadata. 

73 

74 output_directory : Optional[str] 

75 The directory where SMAC output will saved. If set to `None` (default), 

76 a temporary dir will be used. 

77 

78 max_trials : int 

79 Maximum number of trials (i.e., function evaluations) to be run. Defaults to 100. 

80 Note that modifying this value directly affects the value of 

81 `n_random_init`, if latter is set to `None`. 

82 

83 n_random_init : Optional[int] 

84 Number of points evaluated at start to bootstrap the optimizer. 

85 Default depends on max_trials and number of parameters and max_ratio. 

86 Note: it can sometimes be useful to set this to 1 when pre-warming the 

87 optimizer from historical data. 

88 See Also: mlos_bench.optimizer.bulk_register 

89 

90 max_ratio : Optional[int] 

91 Maximum ratio of max_trials to be random configs to be evaluated 

92 at start to bootstrap the optimizer. 

93 Useful if you want to explicitly control the number of random 

94 configs evaluated at start. 

95 

96 use_default_config: bool 

97 Whether to use the default config for the first trial after random initialization. 

98 

99 n_random_probability: float 

100 Probability of choosing to evaluate a random configuration during optimization. 

101 Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. 

102 """ 

103 super().__init__( 

104 parameter_space=parameter_space, 

105 optimization_targets=optimization_targets, 

106 objective_weights=objective_weights, 

107 space_adapter=space_adapter, 

108 ) 

109 

110 # Declare at the top because we need it in __del__/cleanup() 

111 self._temp_output_directory: Optional[TemporaryDirectory] = None 

112 

113 # pylint: disable=import-outside-toplevel 

114 from smac import HyperparameterOptimizationFacade as Optimizer_Smac 

115 from smac import Scenario 

116 from smac.intensifier.abstract_intensifier import AbstractIntensifier 

117 from smac.main.config_selector import ConfigSelector 

118 from smac.random_design.probability_design import ProbabilityRandomDesign 

119 from smac.runhistory import TrialInfo 

120 

121 # Store for TrialInfo instances returned by .ask() 

122 self.trial_info_map: Dict[ConfigSpace.Configuration, TrialInfo] = {} 

123 

124 # The default when not specified is to use a known seed (0) to keep results reproducible. 

125 # However, if a `None` seed is explicitly provided, we let a random seed be 

126 # produced by SMAC. 

127 # https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario 

128 seed = -1 if seed is None else seed 

129 

130 # Create temporary directory for SMAC output (if none provided) 

131 if output_directory is None: 

132 # pylint: disable=consider-using-with 

133 try: 

134 # Argument added in Python 3.10 

135 self._temp_output_directory = TemporaryDirectory(ignore_cleanup_errors=True) 

136 except TypeError: 

137 self._temp_output_directory = TemporaryDirectory() 

138 output_directory = self._temp_output_directory.name 

139 

140 if n_random_init is not None: 

141 assert isinstance(n_random_init, int) and n_random_init >= 0 

142 if n_random_init == max_trials and use_default_config: 

143 # Increase max budgeted trials to account for use_default_config. 

144 max_trials += 1 

145 

146 scenario: Scenario = Scenario( 

147 self.optimizer_parameter_space, 

148 objectives=self._optimization_targets, 

149 name=run_name, 

150 output_directory=Path(output_directory), 

151 deterministic=True, 

152 use_default_config=use_default_config, 

153 n_trials=max_trials, 

154 seed=seed or -1, # if -1, SMAC will generate a random seed internally 

155 n_workers=1, # Use a single thread for evaluating trials 

156 ) 

157 intensifier: AbstractIntensifier = Optimizer_Smac.get_intensifier( 

158 scenario, 

159 max_config_calls=1, 

160 ) 

161 config_selector: ConfigSelector = Optimizer_Smac.get_config_selector( 

162 scenario, 

163 retrain_after=1, 

164 ) 

165 

166 # TODO: When bulk registering prior configs to rewarm the optimizer, 

167 # there is a way to inform SMAC's initial design that we have 

168 # additional_configs and can set n_configs == 0. 

169 # Additionally, we may want to consider encoding those values into the 

170 # runhistory when prewarming the optimizer so that the initial design 

171 # doesn't reperform random init. 

172 # See Also: #488 

173 

174 initial_design_args: Dict[str, Union[list, int, float, Scenario]] = { 

175 "scenario": scenario, 

176 # Workaround a bug in SMAC that sets a default arg to a mutable 

177 # value that can cause issues when multiple optimizers are 

178 # instantiated with the use_default_config option within the same 

179 # process that use different ConfigSpaces so that the second 

180 # receives the default config from both as an additional config. 

181 "additional_configs": [], 

182 } 

183 if n_random_init is not None: 

184 initial_design_args["n_configs"] = n_random_init 

185 if n_random_init > 0.25 * max_trials and max_ratio is None: 

186 warning( 

187 "Number of random initial configs (%d) is " 

188 + "greater than 25%% of max_trials (%d). " 

189 + "Consider setting max_ratio to avoid SMAC overriding n_random_init.", 

190 n_random_init, 

191 max_trials, 

192 ) 

193 if max_ratio is not None: 

194 assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0 

195 initial_design_args["max_ratio"] = max_ratio 

196 

197 # Use the default InitialDesign from SMAC. 

198 # (currently SBOL instead of LatinHypercube due to better uniformity 

199 # for initial sampling which results in lower overall samples required) 

200 initial_design = Optimizer_Smac.get_initial_design( 

201 **initial_design_args, # type: ignore[arg-type] 

202 ) 

203 # initial_design = LatinHypercubeInitialDesign( 

204 # **initial_design_args, # type: ignore[arg-type] 

205 # ) 

206 

207 # Workaround a bug in SMAC that doesn't pass the seed to the random 

208 # design when generated a random_design for itself via the 

209 # get_random_design static method when random_design is None. 

210 assert isinstance(n_random_probability, float) and n_random_probability >= 0 

211 random_design = ProbabilityRandomDesign( 

212 probability=n_random_probability, 

213 seed=scenario.seed, 

214 ) 

215 

216 self.base_optimizer = Optimizer_Smac( 

217 scenario, 

218 SmacOptimizer._dummy_target_func, 

219 initial_design=initial_design, 

220 intensifier=intensifier, 

221 random_design=random_design, 

222 config_selector=config_selector, 

223 multi_objective_algorithm=Optimizer_Smac.get_multi_objective_algorithm( 

224 scenario, 

225 objective_weights=self._objective_weights, 

226 ), 

227 overwrite=True, 

228 logging_level=False, # Use the existing logger 

229 ) 

230 

    def __del__(self) -> None:
        # Best-effort attempt to clean up, in case the user forgets to call .cleanup().
        # cleanup() guards with hasattr/None checks, so this is safe even on a
        # partially-constructed instance and when called more than once.
        self.cleanup()

234 

235 @property 

236 def n_random_init(self) -> int: 

237 """ 

238 Gets the number of random samples to use to initialize the optimizer's search 

239 space sampling. 

240 

241 Note: This may not be equal to the value passed to the initializer, due to 

242 logic present in the SMAC. 

243 See Also: max_ratio 

244 

245 Returns 

246 ------- 

247 int 

248 The number of random samples used to initialize the optimizer's search space sampling. 

249 """ 

250 # pylint: disable=protected-access 

251 return self.base_optimizer._initial_design._n_configs 

252 

253 @staticmethod 

254 def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None: 

255 """ 

256 Dummy target function for SMAC optimizer. 

257 

258 Since we only use the ask-and-tell interface, this is never called. 

259 

260 Parameters 

261 ---------- 

262 config : ConfigSpace.Configuration 

263 Configuration to evaluate. 

264 

265 seed : int 

266 Random seed to use for the target function. Not actually used. 

267 """ 

268 # NOTE: Providing a target function when using the ask-and-tell interface is 

269 # an imperfection of the API -- this is planned to be fixed in some future 

270 # release: https://github.com/automl/SMAC3/issues/946 

271 raise RuntimeError("This function should never be called.") 

272 

273 def _register( 

274 self, 

275 *, 

276 configs: pd.DataFrame, 

277 scores: pd.DataFrame, 

278 context: Optional[pd.DataFrame] = None, 

279 metadata: Optional[pd.DataFrame] = None, 

280 ) -> None: 

281 """ 

282 Registers the given configs and scores. 

283 

284 Parameters 

285 ---------- 

286 configs : pd.DataFrame 

287 Dataframe of configs / parameters. The columns are parameter names and 

288 the rows are the configs. 

289 

290 scores : pd.DataFrame 

291 Scores from running the configs. The index is the same as the index of 

292 the configs. 

293 

294 context : pd.DataFrame 

295 Not Yet Implemented. 

296 

297 metadata: pd.DataFrame 

298 Not Yet Implemented. 

299 """ 

300 from smac.runhistory import ( # pylint: disable=import-outside-toplevel 

301 StatusType, 

302 TrialInfo, 

303 TrialValue, 

304 ) 

305 

306 if context is not None: 

307 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

308 

309 # Register each trial (one-by-one) 

310 for config, (_i, score) in zip( 

311 self._to_configspace_configs(configs=configs), scores.iterrows() 

312 ): 

313 # Retrieve previously generated TrialInfo (returned by .ask()) or create 

314 # new TrialInfo instance 

315 info: TrialInfo = self.trial_info_map.get( 

316 config, 

317 TrialInfo(config=config, seed=self.base_optimizer.scenario.seed), 

318 ) 

319 value = TrialValue(cost=list(score.astype(float)), time=0.0, status=StatusType.SUCCESS) 

320 self.base_optimizer.tell(info, value, save=False) 

321 

322 # Save optimizer once we register all configs 

323 self.base_optimizer.optimizer.save() 

324 

325 def _suggest( 

326 self, 

327 *, 

328 context: Optional[pd.DataFrame] = None, 

329 ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: 

330 """ 

331 Suggests a new configuration. 

332 

333 Parameters 

334 ---------- 

335 context : pd.DataFrame 

336 Not Yet Implemented. 

337 

338 Returns 

339 ------- 

340 configuration : pd.DataFrame 

341 Pandas dataframe with a single row. Column names are the parameter names. 

342 

343 metadata : Optional[pd.DataFrame] 

344 Not yet implemented. 

345 """ 

346 if TYPE_CHECKING: 

347 # pylint: disable=import-outside-toplevel,unused-import 

348 from smac.runhistory import TrialInfo 

349 

350 if context is not None: 

351 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

352 

353 trial: TrialInfo = self.base_optimizer.ask() 

354 trial.config.check_valid_configuration() 

355 ConfigSpace.Configuration( 

356 self.optimizer_parameter_space, 

357 values=trial.config, 

358 ).check_valid_configuration() 

359 assert trial.config.config_space == self.optimizer_parameter_space 

360 self.trial_info_map[trial.config] = trial 

361 config_df = pd.DataFrame( 

362 [trial.config], columns=list(self.optimizer_parameter_space.keys()) 

363 ) 

364 return config_df, None 

365 

    def register_pending(
        self,
        *,
        configs: pd.DataFrame,
        context: Optional[pd.DataFrame] = None,
        metadata: Optional[pd.DataFrame] = None,
    ) -> None:
        """
        Registers the given configs as "pending" (suggested but not yet scored).

        Not Implemented for this optimizer.

        Parameters
        ----------
        configs : pd.DataFrame
            Dataframe of configs / parameters. The columns are parameter names and
            the rows are the configs.

        context : pd.DataFrame
            Not Yet Implemented.

        metadata : pd.DataFrame
            Not Yet Implemented.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()

374 

375 def surrogate_predict( 

376 self, 

377 *, 

378 configs: pd.DataFrame, 

379 context: Optional[pd.DataFrame] = None, 

380 ) -> npt.NDArray: 

381 # pylint: disable=import-outside-toplevel 

382 from smac.utils.configspace import convert_configurations_to_array 

383 

384 if context is not None: 

385 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

386 if self._space_adapter and not isinstance(self._space_adapter, IdentityAdapter): 

387 raise NotImplementedError("Space adapter not supported for surrogate_predict.") 

388 

389 # pylint: disable=protected-access 

390 if len(self._observations) <= self.base_optimizer._initial_design._n_configs: 

391 raise RuntimeError( 

392 "Surrogate model can make predictions *only* after " 

393 "all initial points have been evaluated " 

394 f"{len(self._observations)} <= {self.base_optimizer._initial_design._n_configs}" 

395 ) 

396 if self.base_optimizer._config_selector._model is None: 

397 raise RuntimeError("Surrogate model is not yet trained") 

398 

399 config_array: npt.NDArray = convert_configurations_to_array( 

400 self._to_configspace_configs(configs=configs) 

401 ) 

402 mean_predictions, _ = self.base_optimizer._config_selector._model.predict(config_array) 

403 return mean_predictions.reshape( 

404 -1, 

405 ) 

406 

407 def acquisition_function( 

408 self, 

409 *, 

410 configs: pd.DataFrame, 

411 context: Optional[pd.DataFrame] = None, 

412 ) -> npt.NDArray: 

413 if context is not None: 

414 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

415 if self._space_adapter: 

416 raise NotImplementedError() 

417 

418 # pylint: disable=protected-access 

419 if self.base_optimizer._config_selector._acquisition_function is None: 

420 raise RuntimeError("Acquisition function is not yet initialized") 

421 

422 cs_configs: list = self._to_configspace_configs(configs=configs) 

423 return self.base_optimizer._config_selector._acquisition_function(cs_configs).reshape( 

424 -1, 

425 ) 

426 

427 def cleanup(self) -> None: 

428 if hasattr(self, "_temp_output_directory") and self._temp_output_directory is not None: 

429 self._temp_output_directory.cleanup() 

430 self._temp_output_directory = None 

431 

432 def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace.Configuration]: 

433 """ 

434 Convert a dataframe of configs to a list of ConfigSpace configs. 

435 

436 Parameters 

437 ---------- 

438 configs : pd.DataFrame 

439 Dataframe of configs / parameters. The columns are parameter names and 

440 the rows are the configs. 

441 

442 Returns 

443 ------- 

444 configs : list 

445 List of ConfigSpace configs. 

446 """ 

447 return [ 

448 ConfigSpace.Configuration( 

449 self.optimizer_parameter_space, 

450 # Remove None values for inactive parameters 

451 values=drop_nulls(config.to_dict()), 

452 allow_inactive_with_values=False, 

453 ) 

454 for (_, config) in configs.astype("O").iterrows() 

455 ]