多数据集上的随机梯度下降#
完整示例代码:examples/2_multi_fidelity/2_sgd_datasets.py
from __future__ import annotations
import itertools
import warnings
import numpy as np
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from smac import MultiFidelityFacade as MFFacade
from smac import Scenario
__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
__license__ = "3-clause BSD"
class DigitsDataset:
    """Wraps sklearn's digits dataset and derives binary-classification instances from it."""

    def __init__(self) -> None:
        # Load the full 10-class digits dataset once; all instances are views into it.
        self._data = datasets.load_digits()

    def get_instances(self) -> list[str]:
        """Create instances from the dataset which include two classes only."""
        label_pairs = itertools.combinations(self._data.target_names, 2)
        return [f"{first}-{second}" for first, second in label_pairs]

    def get_instance_features(self) -> dict[str, list[int | float]]:
        """Returns the mean and variance of all instances as features."""

        def _stats(name: str) -> list[int | float]:
            # Feature vector: mean and variance of the instance's pixel data.
            subset, _ = self.get_instance_data(name)
            return [np.mean(subset), np.var(subset)]

        return {name: _stats(name) for name in self.get_instances()}

    def get_instance_data(self, instance: str) -> tuple[np.ndarray, np.ndarray]:
        """Retrieve data from the passed instance."""
        # An instance "A-B" selects only the samples labeled A or B.
        first, second = (int(label) for label in instance.split("-"))
        mask = (self._data.target == first) | (self._data.target == second)
        return self._data.data[mask], self._data.target[mask]
class SGD:
    """Target algorithm: an SGD classifier whose hyperparameters SMAC tunes across instances."""

    def __init__(self, dataset: DigitsDataset) -> None:
        self.dataset = dataset

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the configuration space which defines all parameters and their ranges for the SGD classifier."""
        cs = ConfigurationSpace()

        # Four tunable hyperparameters; eta0 is searched on a log scale.
        cs.add(
            [
                Float("alpha", (0, 1), default=1.0),
                Float("l1_ratio", (0, 1), default=0.5),
                Categorical("learning_rate", ["constant", "invscaling", "adaptive"], default="constant"),
                Float("eta0", (0.00001, 1), default=0.1, log=True),
            ]
        )

        return cs

    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
        """Creates a SGD classifier based on a configuration and evaluates it on the
        digits dataset using cross-validation."""
        with warnings.catch_warnings():
            # Convergence warnings are expected with max_iter=30; silence them.
            warnings.filterwarnings("ignore")

            classifier = SGDClassifier(
                loss="log_loss",
                penalty="elasticnet",
                alpha=config["alpha"],
                l1_ratio=config["l1_ratio"],
                learning_rate=config["learning_rate"],
                eta0=config["eta0"],
                max_iter=30,
                early_stopping=True,
                random_state=seed,
            )

            # Fetch the two-class data belonging to this instance.
            features, labels = self.dataset.get_instance_data(instance)

            # Seeded, shuffled folds keep CV splits consistent across configurations.
            splitter = StratifiedKFold(n_splits=4, random_state=seed, shuffle=True)
            fold_scores = cross_val_score(classifier, features, labels, cv=splitter)

        # SMAC minimizes, so return 1 - accuracy as the cost.
        return 1 - np.mean(fold_scores)
if __name__ == "__main__":
    digits = DigitsDataset()
    sgd_model = SGD(digits)

    # Scenario: treat dataset instances as the fidelity dimension.
    scenario = Scenario(
        sgd_model.configspace,
        walltime_limit=30,  # We want to optimize for 30 seconds
        n_trials=5000,  # We want to try max 5000 different trials
        min_budget=1,  # Use min one instance
        max_budget=45,  # Use max 45 instances (if we have a lot of instances we could constraint it here)
        instances=digits.get_instances(),
        instance_features=digits.get_instance_features(),
    )

    # The multi-fidelity facade drives the optimization with our train method.
    smac = MFFacade(scenario, sgd_model.train, overwrite=True)
    incumbent = smac.optimize()

    # Compare the default configuration against the optimized incumbent.
    default_cost = smac.validate(sgd_model.configspace.get_default_configuration())
    print(f"Default cost: {default_cost}")

    incumbent_cost = smac.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")
描述#
跨多个(数据集)实例优化随机梯度下降(SGD)分类器的示例。
作为预算的替代方案,此处不失一般性,我们将实例视为一种保真度类型。实例代表算法运行的特定场景/条件(例如,不同的数据集、子集、转换)。然后 SMAC 返回在所有实例上表现最佳的算法。在本例中,一个实例是一个二分类数据集,即数字 2 对数字 3。
如果我们使用实例作为保真度,我们需要使用 instance 参数初始化场景(scenario)。在这种情况下,目标函数不再需要 budget 参数。但由于场景(scenario)中的 instance 参数,目标函数现在必须包含一个 instance 参数。
from __future__ import annotations
import itertools
import warnings
import numpy as np
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from smac import MultiFidelityFacade as MFFacade
from smac import Scenario
__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
__license__ = "3-clause BSD"
class DigitsDataset:
    """Wraps sklearn's digits dataset and derives binary-classification instances from it."""

    def __init__(self) -> None:
        # Load the full 10-class digits dataset once; all instances are views into it.
        self._data = datasets.load_digits()

    def get_instances(self) -> list[str]:
        """Create instances from the dataset which include two classes only."""
        label_pairs = itertools.combinations(self._data.target_names, 2)
        return [f"{first}-{second}" for first, second in label_pairs]

    def get_instance_features(self) -> dict[str, list[int | float]]:
        """Returns the mean and variance of all instances as features."""

        def _stats(name: str) -> list[int | float]:
            # Feature vector: mean and variance of the instance's pixel data.
            subset, _ = self.get_instance_data(name)
            return [np.mean(subset), np.var(subset)]

        return {name: _stats(name) for name in self.get_instances()}

    def get_instance_data(self, instance: str) -> tuple[np.ndarray, np.ndarray]:
        """Retrieve data from the passed instance."""
        # An instance "A-B" selects only the samples labeled A or B.
        first, second = (int(label) for label in instance.split("-"))
        mask = (self._data.target == first) | (self._data.target == second)
        return self._data.data[mask], self._data.target[mask]
class SGD:
    """Target algorithm: an SGD classifier whose hyperparameters SMAC tunes across instances."""

    def __init__(self, dataset: DigitsDataset) -> None:
        self.dataset = dataset

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the configuration space which defines all parameters and their ranges for the SGD classifier."""
        cs = ConfigurationSpace()

        # Four tunable hyperparameters; eta0 is searched on a log scale.
        cs.add(
            [
                Float("alpha", (0, 1), default=1.0),
                Float("l1_ratio", (0, 1), default=0.5),
                Categorical("learning_rate", ["constant", "invscaling", "adaptive"], default="constant"),
                Float("eta0", (0.00001, 1), default=0.1, log=True),
            ]
        )

        return cs

    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
        """Creates a SGD classifier based on a configuration and evaluates it on the
        digits dataset using cross-validation."""
        with warnings.catch_warnings():
            # Convergence warnings are expected with max_iter=30; silence them.
            warnings.filterwarnings("ignore")

            classifier = SGDClassifier(
                loss="log_loss",
                penalty="elasticnet",
                alpha=config["alpha"],
                l1_ratio=config["l1_ratio"],
                learning_rate=config["learning_rate"],
                eta0=config["eta0"],
                max_iter=30,
                early_stopping=True,
                random_state=seed,
            )

            # Fetch the two-class data belonging to this instance.
            features, labels = self.dataset.get_instance_data(instance)

            # Seeded, shuffled folds keep CV splits consistent across configurations.
            splitter = StratifiedKFold(n_splits=4, random_state=seed, shuffle=True)
            fold_scores = cross_val_score(classifier, features, labels, cv=splitter)

        # SMAC minimizes, so return 1 - accuracy as the cost.
        return 1 - np.mean(fold_scores)
if __name__ == "__main__":
    digits = DigitsDataset()
    sgd_model = SGD(digits)

    # Scenario: treat dataset instances as the fidelity dimension.
    scenario = Scenario(
        sgd_model.configspace,
        walltime_limit=30,  # We want to optimize for 30 seconds
        n_trials=5000,  # We want to try max 5000 different trials
        min_budget=1,  # Use min one instance
        max_budget=45,  # Use max 45 instances (if we have a lot of instances we could constraint it here)
        instances=digits.get_instances(),
        instance_features=digits.get_instance_features(),
    )

    # The multi-fidelity facade drives the optimization with our train method.
    smac = MFFacade(scenario, sgd_model.train, overwrite=True)
    incumbent = smac.optimize()

    # Compare the default configuration against the optimized incumbent.
    default_cost = smac.validate(sgd_model.configspace.get_default_configuration())
    print(f"Default cost: {default_cost}")

    incumbent_cost = smac.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")