跳到内容

多数据集上的随机梯度下降#

展开复制 examples/2_multi_fidelity/2_sgd_datasets.py (右上角)
from __future__ import annotations

import itertools
import warnings

import numpy as np
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

from smac import MultiFidelityFacade as MFFacade
from smac import Scenario

__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
__license__ = "3-clause BSD"


class DigitsDataset:
    """Wrapper around the scikit-learn digits data that exposes pairwise class instances."""

    def __init__(self) -> None:
        # Load the digits data once; every instance is carved out of it on demand.
        self._data = datasets.load_digits()

    def get_instances(self) -> list[str]:
        """Return one instance name ``"<a>-<b>"`` for every unordered pair of target classes."""
        names = []
        for first, second in itertools.combinations(self._data.target_names, 2):
            names.append(f"{first}-{second}")

        return names

    def get_instance_features(self) -> dict[str, list[int | float]]:
        """Map each instance name to ``[mean, variance]`` taken over all values of its samples."""
        names = self.get_instances()
        samples_per_instance = (self.get_instance_data(name)[0] for name in names)

        return {name: [np.mean(samples), np.var(samples)] for name, samples in zip(names, samples_per_instance)}

    def get_instance_data(self, instance: str) -> tuple[np.ndarray, np.ndarray]:
        """Return the samples and targets that belong to the two classes named by ``instance``."""
        # The instance name encodes both class labels, separated by a dash.
        first, second = (int(label) for label in instance.split("-"))

        labels = self._data.target
        # Keep exactly the rows whose target equals one of the two classes.
        selected = (labels == first) | (labels == second)

        return self._data.data[selected], labels[selected]


class SGD:
    """SGD classifier target whose hyperparameters are evaluated on instances of a digits dataset."""

    def __init__(self, dataset: DigitsDataset) -> None:
        # Keep the dataset so that ``train`` can look up the samples of an instance.
        self.dataset = dataset

    @property
    def configspace(self) -> ConfigurationSpace:
        """Return a newly built configuration space with the SGD classifier hyperparameters and their ranges."""
        hyperparameters = [
            Float("alpha", (0, 1), default=1.0),
            Float("l1_ratio", (0, 1), default=0.5),
            Categorical("learning_rate", ["constant", "invscaling", "adaptive"], default="constant"),
            # eta0 is declared with log=True.
            Float("eta0", (0.00001, 1), default=0.1, log=True),
        ]

        space = ConfigurationSpace()
        space.add(hyperparameters)

        return space

    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
        """Cross-validate an SGD classifier built from ``config`` on the data of ``instance``.

        Returns one minus the mean cross-validation score, so lower values are better.
        """
        # Every warning raised while building and evaluating the classifier is silenced.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")

            # Hyperparameters that are taken directly from the configuration
            tuned = {name: config[name] for name in ("alpha", "l1_ratio", "learning_rate", "eta0")}
            classifier = SGDClassifier(
                loss="log_loss",
                penalty="elasticnet",
                max_iter=30,
                early_stopping=True,
                random_state=seed,
                **tuned,
            )

            # Samples and targets of the requested instance
            features, labels = self.dataset.get_instance_data(instance)

            # Seeding the shuffled splitter keeps the CV splits consistent for a given seed.
            folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
            fold_scores = cross_val_score(classifier, features, labels, cv=folds)

        mean_score = np.mean(fold_scores)
        return 1 - mean_score


if __name__ == "__main__":
    dataset = DigitsDataset()
    model = SGD(dataset)

    # Build the instance list and the configuration space once and reuse them below
    instances = dataset.get_instances()
    configspace = model.configspace

    scenario = Scenario(
        configspace,
        walltime_limit=30,  # We want to optimize for 30 seconds
        n_trials=5000,  # We want to try max 5000 different trials
        min_budget=1,  # Use min one instance
        # Use at most all instances (45 class pairs for the digits dataset). Deriving the value from
        # the instance list keeps it correct if the dataset changes; with a lot of instances the
        # budget could be constrained here.
        max_budget=len(instances),
        instances=instances,
        instance_features=dataset.get_instance_features(),
    )

    # Create our SMAC object and pass the scenario and the train method
    smac = MFFacade(
        scenario,
        model.train,
        overwrite=True,
    )

    # Now we start the optimization process
    incumbent = smac.optimize()

    # Validate the default configuration as a baseline for the incumbent
    default_cost = smac.validate(configspace.get_default_configuration())
    print(f"Default cost: {default_cost}")

    incumbent_cost = smac.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")

描述#

跨多个(数据集)实例优化随机梯度下降(SGD)分类器的示例。

作为预算的替代方案,此处不失一般性,我们将实例视为一种保真度类型。实例代表算法运行的特定场景/条件(例如,不同的数据集、子集、变换)。然后 SMAC 返回在所有实例上表现最佳的算法配置。在本例中,一个实例是一个二分类数据集,例如数字 2 对数字 3。

如果我们使用实例作为保真度,我们需要使用 instances 参数(以及可选的 instance_features 参数)初始化场景(scenario)。在这种情况下,目标函数不再需要 budget 参数;但由于场景(scenario)中设置了 instances 参数,目标函数现在必须包含一个 instance 参数。

from __future__ import annotations

import itertools
import warnings

import numpy as np
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

from smac import MultiFidelityFacade as MFFacade
from smac import Scenario

__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
__license__ = "3-clause BSD"


class DigitsDataset:
    """Wrapper around the scikit-learn digits data that exposes pairwise class instances."""

    def __init__(self) -> None:
        # Load the digits data once; every instance is carved out of it on demand.
        self._data = datasets.load_digits()

    def get_instances(self) -> list[str]:
        """Return one instance name ``"<a>-<b>"`` for every unordered pair of target classes."""
        names = []
        for first, second in itertools.combinations(self._data.target_names, 2):
            names.append(f"{first}-{second}")

        return names

    def get_instance_features(self) -> dict[str, list[int | float]]:
        """Map each instance name to ``[mean, variance]`` taken over all values of its samples."""
        names = self.get_instances()
        samples_per_instance = (self.get_instance_data(name)[0] for name in names)

        return {name: [np.mean(samples), np.var(samples)] for name, samples in zip(names, samples_per_instance)}

    def get_instance_data(self, instance: str) -> tuple[np.ndarray, np.ndarray]:
        """Return the samples and targets that belong to the two classes named by ``instance``."""
        # The instance name encodes both class labels, separated by a dash.
        first, second = (int(label) for label in instance.split("-"))

        labels = self._data.target
        # Keep exactly the rows whose target equals one of the two classes.
        selected = (labels == first) | (labels == second)

        return self._data.data[selected], labels[selected]


class SGD:
    """SGD classifier target whose hyperparameters are evaluated on instances of a digits dataset."""

    def __init__(self, dataset: DigitsDataset) -> None:
        # Keep the dataset so that ``train`` can look up the samples of an instance.
        self.dataset = dataset

    @property
    def configspace(self) -> ConfigurationSpace:
        """Return a newly built configuration space with the SGD classifier hyperparameters and their ranges."""
        hyperparameters = [
            Float("alpha", (0, 1), default=1.0),
            Float("l1_ratio", (0, 1), default=0.5),
            Categorical("learning_rate", ["constant", "invscaling", "adaptive"], default="constant"),
            # eta0 is declared with log=True.
            Float("eta0", (0.00001, 1), default=0.1, log=True),
        ]

        space = ConfigurationSpace()
        space.add(hyperparameters)

        return space

    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
        """Cross-validate an SGD classifier built from ``config`` on the data of ``instance``.

        Returns one minus the mean cross-validation score, so lower values are better.
        """
        # Every warning raised while building and evaluating the classifier is silenced.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")

            # Hyperparameters that are taken directly from the configuration
            tuned = {name: config[name] for name in ("alpha", "l1_ratio", "learning_rate", "eta0")}
            classifier = SGDClassifier(
                loss="log_loss",
                penalty="elasticnet",
                max_iter=30,
                early_stopping=True,
                random_state=seed,
                **tuned,
            )

            # Samples and targets of the requested instance
            features, labels = self.dataset.get_instance_data(instance)

            # Seeding the shuffled splitter keeps the CV splits consistent for a given seed.
            folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
            fold_scores = cross_val_score(classifier, features, labels, cv=folds)

        mean_score = np.mean(fold_scores)
        return 1 - mean_score


if __name__ == "__main__":
    dataset = DigitsDataset()
    model = SGD(dataset)

    # Build the instance list and the configuration space once and reuse them below
    instances = dataset.get_instances()
    configspace = model.configspace

    scenario = Scenario(
        configspace,
        walltime_limit=30,  # We want to optimize for 30 seconds
        n_trials=5000,  # We want to try max 5000 different trials
        min_budget=1,  # Use min one instance
        # Use at most all instances (45 class pairs for the digits dataset). Deriving the value from
        # the instance list keeps it correct if the dataset changes; with a lot of instances the
        # budget could be constrained here.
        max_budget=len(instances),
        instances=instances,
        instance_features=dataset.get_instance_features(),
    )

    # Create our SMAC object and pass the scenario and the train method
    smac = MFFacade(
        scenario,
        model.train,
        overwrite=True,
    )

    # Now we start the optimization process
    incumbent = smac.optimize()

    # Validate the default configuration as a baseline for the incumbent
    default_cost = smac.validate(configspace.get_default_configuration())
    print(f"Default cost: {default_cost}")

    incumbent_cost = smac.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")