Source code for stable_ssl.config

# -*- coding: utf-8 -*-
"""Configuration for stable-ssl runs."""
#
# Author: Hugues Van Assel <vanasselhugues@gmail.com>
#         Randall Balestriero <randallbalestriero@gmail.com>
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field, make_dataclass
from typing import Optional, Tuple
import logging
from omegaconf import OmegaConf
from pathlib import Path
from datetime import datetime
import random
import torch

from .utils import LARS
from .joint_embedding import (
    BarlowTwinsConfig,
    SimCLRConfig,
    VICRegConfig,
    WMSEConfig,
)
from .supervised import Supervised
from .data import DataConfig
from .base import BaseModelConfig


@dataclass
class OptimConfig:
    """Configuration for the 'optimizer' parameters.

    Parameters
    ----------
    optimizer : str
        Type of optimizer to use (e.g., "AdamW", "RMSprop", "SGD", "LARS").
        Default is "LARS".
    lr : float
        Learning rate for the optimizer. Default is 1e0.
    batch_size : int, optional
        Batch size for training. Default is 256.
    epochs : int, optional
        Number of epochs to train the model. Default is 1000.
    max_steps : int, optional
        Maximum number of steps per epoch. Default is -1.
    weight_decay : float
        Weight decay for the optimizer. Default is 0.
    momentum : float, optional
        Momentum for the optimizer. Default is None, in which case the
        optimizer's own default is used.
    nesterov : bool, optional
        Whether to use Nesterov momentum. Default is None, in which case the
        optimizer's own default is used.
    betas : Tuple[float, float], optional
        Betas for the AdamW optimizer. Default is None, in which case the
        optimizer's own default (e.g., (0.9, 0.999) for AdamW) is used.
    grad_max_norm : float, optional
        Maximum norm for gradient clipping. Default is None.
    """

    optimizer: str = "LARS"
    lr: float = 1e0
    batch_size: int = 256
    epochs: int = 1000
    max_steps: int = -1
    weight_decay: float = 0
    momentum: Optional[float] = None
    nesterov: Optional[bool] = None
    betas: Optional[Tuple[float, float]] = None
    grad_max_norm: Optional[float] = None

    def __post_init__(self):
        """Validate and set default values for optimizer parameters.

        Ensures that a valid optimizer is provided and assigns default values
        for parameters like learning rate, weight decay, and others, if they
        are not explicitly set.
        """
        if not (hasattr(torch.optim, self.optimizer) or self.optimizer == "LARS"):
            raise ValueError(
                f"Invalid optimizer: {self.optimizer}. Must be a "
                "torch optimizer or 'LARS'."
            )

        # Instantiate the optimizer to get its default parameters.
        optimizer = (
            LARS if self.optimizer == "LARS" else getattr(torch.optim, self.optimizer)
        )
        default_params = optimizer([torch.tensor(0)]).defaults

        # Ensure parameters are provided appropriately based on the optimizer.
        for param in ["lr", "weight_decay", "momentum", "betas", "nesterov"]:
            if param in default_params.keys():
                if getattr(self, param) is None:
                    # If a relevant parameter is not provided, the optimizer's
                    # default value is used.
                    default_value = default_params[param]
                    setattr(self, param, default_value)
                    logging.warning(
                        f"{param} not provided for {self.optimizer} "
                        f"optimizer. Default value of {default_value} is used."
                    )
            else:
                # If the parameter is not used by the optimizer, it is set to None.
                setattr(self, param, None)
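
# Illustrative usage (not part of the library): OptimConfig validates the
# optimizer name and, in __post_init__, fills missing hyperparameters from the
# optimizer's own defaults while clearing the ones the optimizer does not use.
# A minimal sketch, assuming a standard torch install:
#
#     cfg = OptimConfig(optimizer="SGD", lr=0.1)
#     # momentum and nesterov are filled from torch.optim.SGD's defaults
#     # (a warning is logged for each); betas stays None since SGD ignores it.
#
#     OptimConfig(optimizer="NotAnOptimizer")  # raises ValueError
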
@dataclass
class HardwareConfig:
    """Configuration for the 'hardware' parameters.

    Parameters
    ----------
    seed : int, optional
        Random seed for reproducibility. Default is None.
    float16 : bool, optional
        Whether to use mixed precision (float16) for training. Default is False.
    gpu : int, optional
        GPU device ID to use for training. Default is 0.
    world_size : int, optional
        Number of processes participating in distributed training. Default is 1.
    port : int, optional
        Port number for distributed training. Default is None.
    """

    seed: Optional[int] = None
    float16: bool = False
    gpu: int = 0
    world_size: int = 1
    port: Optional[int] = None

    def __post_init__(self):
        """Set a random port for distributed training if not provided."""
        self.port = self.port or random.randint(49152, 65535)
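
# Illustrative usage (not part of the library): when no port is given,
# __post_init__ draws one from the dynamic/private range (49152-65535), so
# concurrent runs on the same machine are unlikely to collide. A minimal sketch:
#
#     hw = HardwareConfig(seed=42, float16=True, world_size=2)
#     assert 49152 <= hw.port <= 65535
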
@dataclass
class LogConfig:
    """Configuration for the 'log' parameters.

    Parameters
    ----------
    folder : str, optional
        Path to the folder where logs and checkpoints will be saved.
        Default is None, in which case "./logs" is used.
    run : str, optional
        Identifier of the run, used as a subfolder of `folder`.
        Default is None, in which case the current timestamp is used.
    load_from : str, optional
        Path to a checkpoint from which to load the model, optimizer, and
        scheduler. Default is "ckpt".
    level : int, optional
        Logging level (e.g., logging.INFO). Default is logging.INFO.
    checkpoint_frequency : int, optional
        Frequency of saving checkpoints (in terms of epochs). Default is 10.
    save_final_model : bool, optional
        Whether to save the final trained model. Default is False.
    final_model_name : str, optional
        Name for the final saved model. Default is "final_model".
    eval_only : bool, optional
        Whether to only evaluate the model without training. Default is False.
    eval_epoch_freq : int, optional
        Frequency of evaluation (in terms of epochs). Default is 1.
    wandb_entity : str, optional
        Name of the Weights & Biases entity. Default is None.
    wandb_project : str, optional
        Name of the Weights & Biases project. Default is None.
    """

    folder: Optional[str] = None
    run: Optional[str] = None
    load_from: str = "ckpt"
    level: int = logging.INFO
    checkpoint_frequency: int = 10
    save_final_model: bool = False
    final_model_name: str = "final_model"
    eval_only: bool = False
    eval_epoch_freq: int = 1
    wandb_entity: Optional[str] = None
    wandb_project: Optional[str] = None

    def __post_init__(self):
        """Initialize the logging folder and run settings.

        If the folder path is not specified, a default path under `./logs` is
        created. The run identifier is set using the current timestamp if not
        provided.
        """
        if self.folder is None:
            self.folder = Path("./logs")
        else:
            self.folder = Path(self.folder)

        if self.run is None:
            self.run = datetime.now().strftime("%Y%m%d_%H%M%S.%f")

        (self.folder / self.run).mkdir(parents=True, exist_ok=True)

    @property
    def dump_path(self):
        """Return the full path where logs and checkpoints are stored.

        This path includes the base folder and the run identifier.
        """
        return self.folder / self.run
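
# Illustrative usage (not part of the library): LogConfig creates the run
# directory eagerly in __post_init__ and exposes it through dump_path.
# A minimal sketch, with a hypothetical run name:
#
#     log = LogConfig(folder="./logs", run="my_experiment")
#     print(log.dump_path)  # PosixPath('logs/my_experiment'), created on disk
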
@dataclass
class TrainerConfig:
    """Global configuration for training a model.

    Parameters
    ----------
    model : BaseModelConfig
        Model configuration.
    data : DataConfig
        Data configuration.
    optim : OptimConfig
        Optimizer configuration.
    hardware : HardwareConfig
        Hardware configuration.
    log : LogConfig
        Logging and checkpointing configuration.
    """

    model: BaseModelConfig = field(default_factory=BaseModelConfig)
    data: DataConfig = field(default_factory=DataConfig)
    optim: OptimConfig = field(default_factory=OptimConfig)
    hardware: HardwareConfig = field(default_factory=HardwareConfig)
    log: LogConfig = field(default_factory=LogConfig)

    def __repr__(self) -> str:
        """Return a YAML representation of the configuration."""
        return OmegaConf.to_yaml(self)

    def __str__(self) -> str:
        """Return a YAML string of the configuration."""
        return OmegaConf.to_yaml(self)


_MODEL_CONFIGS = {
    "SimCLR": SimCLRConfig,
    "Barlowtwins": BarlowTwinsConfig,
    "Supervised": BaseModelConfig,
    "VICReg": VICRegConfig,
    "WMSE": WMSEConfig,
}


def get_args(cfg_dict, model_class=None):
    """Create and return a TrainerConfig from a configuration dictionary."""
    kwargs = {
        name: value
        for name, value in cfg_dict.items()
        if name not in ["data", "optim", "model", "hardware", "log"]
    }

    model = cfg_dict.get("model", {})
    if model_class is None:
        name = model.get("name", None)
    else:
        if issubclass(model_class, Supervised):
            name = "Supervised"
    model = _MODEL_CONFIGS[name](**model)

    args = TrainerConfig(
        model=model,
        data=DataConfig(**cfg_dict.get("data", {})),
        optim=OptimConfig(**cfg_dict.get("optim", {})),
        hardware=HardwareConfig(**cfg_dict.get("hardware", {})),
        log=LogConfig(**cfg_dict.get("log", {})),
    )

    # Extend TrainerConfig with any extra top-level keys from the config dict,
    # so they remain accessible as attributes of the returned object.
    args.__class__ = make_dataclass(
        "TrainerConfig",
        fields=[(name, type(v), v) for name, v in kwargs.items()],
        bases=(type(args),),
    )

    return args
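
# Illustrative usage (not part of the library): get_args is typically fed the
# plain dict obtained from an OmegaConf/Hydra config. A minimal sketch, where
# the config content is a hypothetical example rather than a shipped file:
#
#     cfg = OmegaConf.create(
#         {
#             "model": {"name": "SimCLR"},
#             "optim": {"optimizer": "LARS", "lr": 1.0},
#             "log": {"folder": "./logs"},
#         }
#     )
#     args = get_args(OmegaConf.to_container(cfg, resolve=True))
#     print(args.optim.lr)  # 1.0
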