跳到内容

通过强化(Intensification)机制加速交叉验证#

展开复制 examples/4_advanced_optimizer/4_intensify_crossvalidation.py (右上角)
__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
__license__ = "3-clause BSD"

# Number of cross-validation folds. The same value is used below as the number of SMAC instances
# (one instance per fold) and as the intensifier's ``max_config_calls``.
N_FOLDS = 10

from ConfigSpace import Configuration, ConfigurationSpace, Float
from sklearn import datasets, svm
from sklearn.model_selection import StratifiedKFold

from smac import HyperparameterOptimizationFacade, Scenario
from smac.intensifier import Intensifier

# We load the digits dataset, a small-scale 10-class digit recognition dataset
X, y = datasets.load_digits(return_X_y=True)


class SVM:
    """SVM target for SMAC that is evaluated on one cross-validation fold per call."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the configuration space holding the hyperparameters ``C`` and ``gamma``.

        A fresh ``ConfigurationSpace`` (seeded with 0) is created on every access.

        Returns
        -------
        ConfigurationSpace
            Space with two log-scaled float hyperparameters, both defaulting to 1.0.
        """
        # Build Configuration Space which defines all parameters and their ranges
        cs = ConfigurationSpace(seed=0)

        # First we create our hyperparameters
        C = Float("C", (2**-5, 2**15), default=1.0, log=True)
        gamma = Float("gamma", (2**-15, 2**3), default=1.0, log=True)

        # Add hyperparameters to our configspace
        cs.add([C, gamma])

        return cs

    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
        """Create an SVM from a configuration and evaluate it on one fold of the digits dataset.

        Parameters
        ----------
        config: Configuration
            The configuration to train the SVM.
        instance: str
            The name of the instance this configuration should be evaluated on. This is always of type
            string by definition. In our case we cast to int and use it as the index of the
            cross-validation fold, but this could also be the filename of a problem instance to be loaded.
        seed: int
            The seed used for this call. It seeds both the classifier and the fold shuffling.

        Returns
        -------
        float
            The cost of the configuration on the requested fold, i.e. ``1 - accuracy`` on its test split.

        Raises
        ------
        ValueError
            If ``instance`` is not an integer string, or does not name a fold in ``[0, N_FOLDS)``.
        """
        fold_index = int(instance)
        if not 0 <= fold_index < N_FOLDS:
            # Without this check no fold would match below and the cost would never be computed.
            raise ValueError(f"instance must name a fold index in [0, {N_FOLDS - 1}], got {instance!r}")

        classifier = svm.SVC(**config, random_state=seed)
        splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
        for k, (train_idx, test_idx) in enumerate(splitter.split(X=X, y=y)):
            if k == fold_index:
                # Stop at the requested fold; the remaining splits are not needed.
                break

        classifier.fit(X[train_idx], y[train_idx])
        cost = 1 - classifier.score(X[test_idx], y[test_idx])

        return cost


if __name__ == "__main__":
    model = SVM()

    # Each cross-validation fold is exposed to SMAC as one instance. Instances are identified by
    # name (a string); the feature vector of every instance is just its fold index.
    fold_names = [f"{i}" for i in range(N_FOLDS)]
    # NOTE(review): the original comment on instance_features only said "breaks SMAC"; it is unclear
    # what exactly breaks, so confirm before changing or removing this argument.
    fold_features = {f"{i}": [i] for i in range(N_FOLDS)}

    # The scenario holds the general information about the run.
    scenario = Scenario(
        model.configspace,
        # Run at most 50 trials. With deterministic=True a trial is a (config, instance) pair; with
        # deterministic=False it would be a (config, instance, seed) triple. The number of distinct
        # configurations evaluated by SMAC will be lower than this number, because some of the
        # configurations will be executed on more than one instance (CV fold).
        n_trials=50,
        instances=fold_names,
        instance_features=fold_features,
        # To simplify the problem we make SMAC believe that the optimization problem is deterministic.
        deterministic=True,
    )

    # Keep the facade's default initial design, but reduce it to 5 initial configurations.
    init_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    # The intensifier governs the selection of instance-seed pairs. Since deterministic is True
    # above, it only governs the instance in this example. Creating it by hand is normally not
    # required; it is done here only to raise max_config_calls (the number of instance-seed pairs
    # tried per configuration) to the number of cross-validation folds, while the default would be 3.
    fold_intensifier = Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0)

    # Now we use SMAC to find the best hyperparameters.
    facade = HyperparameterOptimizationFacade(
        scenario,
        model.train,
        initial_design=init_design,
        # If the run exists, we overwrite it; alternatively, we could continue from the last state.
        overwrite=True,
        intensifier=fold_intensifier,
    )

    incumbent = facade.optimize()

    # Cost of the default configuration, for comparison.
    default_config = model.configspace.get_default_configuration()
    default_cost = facade.validate(default_config)
    print(f"Default cost: {default_cost}")

    # Cost of the incumbent returned by the optimization.
    incumbent_cost = facade.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")

    # Report how many configurations were evaluated. If this number is higher than 5, we have looked
    # at more configurations than regular cross-validation would allow, where the number of
    # configurations equals the number of trials divided by the number of folds (50 / 10).
    runhistory = facade.runhistory
    print(f"Number of evaluated configurations: {len(runhistory.config_ids)}")

描述#

这是一个在 digits 数据集上优化简单支持向量机的示例。在简单示例中,所有交叉验证折是一次性全部执行的;与之不同,本示例使用了原始 SMAC 论文中描述、并在 Auto-WEKA 中同样得到应用的强化(intensification)机制。借助该机制,如果某个配置在评估了若干折之后已经被发现劣于当前最优配置(incumbent),就可以提前终止对它的评估。当评估单个配置的代价很高时(例如必须训练神经网络,或必须在大型数据集上评估配置),这种机制尤其有用。

__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
__license__ = "3-clause BSD"

# Number of cross-validation folds. The same value is used below as the number of SMAC instances
# (one instance per fold) and as the intensifier's ``max_config_calls``.
N_FOLDS = 10

from ConfigSpace import Configuration, ConfigurationSpace, Float
from sklearn import datasets, svm
from sklearn.model_selection import StratifiedKFold

from smac import HyperparameterOptimizationFacade, Scenario
from smac.intensifier import Intensifier

# We load the digits dataset, a small-scale 10-class digit recognition dataset
X, y = datasets.load_digits(return_X_y=True)


class SVM:
    """SVM target for SMAC that is evaluated on one cross-validation fold per call."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the configuration space holding the hyperparameters ``C`` and ``gamma``.

        A fresh ``ConfigurationSpace`` (seeded with 0) is created on every access.

        Returns
        -------
        ConfigurationSpace
            Space with two log-scaled float hyperparameters, both defaulting to 1.0.
        """
        # Build Configuration Space which defines all parameters and their ranges
        cs = ConfigurationSpace(seed=0)

        # First we create our hyperparameters
        C = Float("C", (2**-5, 2**15), default=1.0, log=True)
        gamma = Float("gamma", (2**-15, 2**3), default=1.0, log=True)

        # Add hyperparameters to our configspace
        cs.add([C, gamma])

        return cs

    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
        """Create an SVM from a configuration and evaluate it on one fold of the digits dataset.

        Parameters
        ----------
        config: Configuration
            The configuration to train the SVM.
        instance: str
            The name of the instance this configuration should be evaluated on. This is always of type
            string by definition. In our case we cast to int and use it as the index of the
            cross-validation fold, but this could also be the filename of a problem instance to be loaded.
        seed: int
            The seed used for this call. It seeds both the classifier and the fold shuffling.

        Returns
        -------
        float
            The cost of the configuration on the requested fold, i.e. ``1 - accuracy`` on its test split.

        Raises
        ------
        ValueError
            If ``instance`` is not an integer string, or does not name a fold in ``[0, N_FOLDS)``.
        """
        fold_index = int(instance)
        if not 0 <= fold_index < N_FOLDS:
            # Without this check no fold would match below and the cost would never be computed.
            raise ValueError(f"instance must name a fold index in [0, {N_FOLDS - 1}], got {instance!r}")

        classifier = svm.SVC(**config, random_state=seed)
        splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
        for k, (train_idx, test_idx) in enumerate(splitter.split(X=X, y=y)):
            if k == fold_index:
                # Stop at the requested fold; the remaining splits are not needed.
                break

        classifier.fit(X[train_idx], y[train_idx])
        cost = 1 - classifier.score(X[test_idx], y[test_idx])

        return cost


if __name__ == "__main__":
    model = SVM()

    # Each cross-validation fold is exposed to SMAC as one instance. Instances are identified by
    # name (a string); the feature vector of every instance is just its fold index.
    fold_names = [f"{i}" for i in range(N_FOLDS)]
    # NOTE(review): the original comment on instance_features only said "breaks SMAC"; it is unclear
    # what exactly breaks, so confirm before changing or removing this argument.
    fold_features = {f"{i}": [i] for i in range(N_FOLDS)}

    # The scenario holds the general information about the run.
    scenario = Scenario(
        model.configspace,
        # Run at most 50 trials. With deterministic=True a trial is a (config, instance) pair; with
        # deterministic=False it would be a (config, instance, seed) triple. The number of distinct
        # configurations evaluated by SMAC will be lower than this number, because some of the
        # configurations will be executed on more than one instance (CV fold).
        n_trials=50,
        instances=fold_names,
        instance_features=fold_features,
        # To simplify the problem we make SMAC believe that the optimization problem is deterministic.
        deterministic=True,
    )

    # Keep the facade's default initial design, but reduce it to 5 initial configurations.
    init_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    # The intensifier governs the selection of instance-seed pairs. Since deterministic is True
    # above, it only governs the instance in this example. Creating it by hand is normally not
    # required; it is done here only to raise max_config_calls (the number of instance-seed pairs
    # tried per configuration) to the number of cross-validation folds, while the default would be 3.
    fold_intensifier = Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0)

    # Now we use SMAC to find the best hyperparameters.
    facade = HyperparameterOptimizationFacade(
        scenario,
        model.train,
        initial_design=init_design,
        # If the run exists, we overwrite it; alternatively, we could continue from the last state.
        overwrite=True,
        intensifier=fold_intensifier,
    )

    incumbent = facade.optimize()

    # Cost of the default configuration, for comparison.
    default_config = model.configspace.get_default_configuration()
    default_cost = facade.validate(default_config)
    print(f"Default cost: {default_cost}")

    # Cost of the incumbent returned by the optimization.
    incumbent_cost = facade.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")

    # Report how many configurations were evaluated. If this number is higher than 5, we have looked
    # at more configurations than regular cross-validation would allow, where the number of
    # configurations equals the number of trials divided by the number of folds (50 / 10).
    runhistory = facade.runhistory
    print(f"Number of evaluated configurations: {len(runhistory.config_ids)}")