From f2e55371b89cf1f7aa1eec90b52a2f80cfadda2a Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Fri, 8 Mar 2024 18:15:52 -0800
Subject: [PATCH 01/32] add `log_analytics` function to `MosaicMLLogger`

---
 composer/loggers/mosaicml_logger.py | 19 +++++++++++++++++++
 composer/trainer/trainer.py         |  3 +++
 2 files changed, 22 insertions(+)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index fa317d6f8c..325d03af0e 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -94,6 +94,25 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
     def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
         self._log_metadata(metrics)
 
+    def log_analytics(self, autoresume: bool, trainer_state: State, loggers: List[LoggerDestination]):
+        metrics = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
+
+        if trainer_state.fsdp_config:
+            metrics['composer/sharding_strategy'] = trainer_state.fsdp_config.get('sharding_strategy', None)
+            metrics['composer/activation_checkpointing'] = trainer_state.fsdp_config.get('activation_checkpointing', False)
+            metrics['composer/forward_prefetch'] = trainer_state.fsdp_config.get('forward_prefetch', False)
+            metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get('backward_prefetch', None)
+
+            mixed_precision = trainer_state.fsdp_config.get('mixed_precision', None)
+            if mixed_precision is not None and isinstance(mixed_precision, dict):
+                # Sorting the keys allows us to parse this dict value as JSON in a SQL query if needed
+                metrics['composer/mixed_precision'] = json.dumps(mixed_precision, sort_keys=True)
+            else:
+                metrics['composer/mixed_precision'] = mixed_precision
+
+        if trainer_state.fsdp_state_dict_type is not None:
+            metrics['composer/state_dict_type'] = trainer_state.fsdp_state_dict_type
+
     def after_load(self, state: State, logger: Logger) -> None:
         # Log model data downloaded and initialized for run events
         log.debug(f'Logging model initialized time to metadata')
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 01cd0fcc9b..95180eaa5f 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1113,6 +1113,7 @@ def __init__(
 
         # MosaicML Logger
         # Keep MosaicML logger above the RemoteUploaderDownloader so that fit end is reported before the final checkpoint begins uploading
+        mosaicml_logger = None
         if os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true' and os.environ.get(
                 MOSAICML_ACCESS_TOKEN_ENV_VAR) is not None and not any(isinstance(x, MosaicMLLogger) for x in loggers):
             log.info('Detected run on MosaicML platform. Adding MosaicMLLogger to loggers.')
@@ -1131,6 +1132,8 @@ def __init__(
 
         # Logger
         self.logger = Logger(state=self.state, destinations=loggers)
+        if mosaicml_logger:
+            mosaicml_logger.log_analytics(autoresume, self.state, loggers)
 
         if save_latest_filename is not None:
             remote_ud_has_format_string = [
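For reference, the payload this first patch assembles is a flat dict of `composer/`-prefixed keys that the logger forwards through its metadata path. Below is a minimal, self-contained sketch of the key/value shapes; the `precision` and `fsdp_config` values are hypothetical stand-ins for the real `State` fields, and only the key names and the JSON-sorted `mixed_precision` mirror the diff above.

import json

# Hypothetical stand-ins for trainer_state.precision / trainer_state.fsdp_config.
precision = 'amp_bf16'
fsdp_config = {
    'sharding_strategy': 'FULL_SHARD',
    'mixed_precision': {'param_dtype': 'bf16', 'reduce_dtype': 'bf16'},
}

metrics = {'composer/autoresume': False, 'composer/precision': precision}
if fsdp_config:
    metrics['composer/sharding_strategy'] = fsdp_config.get('sharding_strategy', None)
    mixed_precision = fsdp_config.get('mixed_precision', None)
    if isinstance(mixed_precision, dict):
        # Sorted keys keep the serialized string byte-stable across runs.
        metrics['composer/mixed_precision'] = json.dumps(mixed_precision, sort_keys=True)
    else:
        metrics['composer/mixed_precision'] = mixed_precision

print(metrics)
# {'composer/autoresume': False, 'composer/precision': 'amp_bf16',
#  'composer/sharding_strategy': 'FULL_SHARD',
#  'composer/mixed_precision': '{"param_dtype": "bf16", "reduce_dtype": "bf16"}'}

Sorting keys before `json.dumps` is what makes the value parseable as JSON in a SQL query later, as the in-diff comment notes.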
From 55e738f1e4ba8152b1e5eb3cb344e16b656f6eb8 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Mon, 11 Mar 2024 20:48:14 -0700
Subject: [PATCH 02/32] add `optimizers`, `loggers`, `algorithms`, `device_mesh`, and `save_interval` to analytics logs

---
 composer/loggers/mosaicml_logger.py | 73 ++++++++++++++++++++++++++---
 composer/trainer/trainer.py         |  2 +-
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index f66ae5e45f..c257d125e7 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -15,16 +15,30 @@
 import warnings
 from concurrent.futures import wait
 from functools import reduce
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import mcli
 import torch
 
+from composer.core import Time, Event
 from composer.core.time import TimeUnit
+from composer.utils import dist
+
+# composer logger types for analytics logging
 from composer.loggers import Logger
-from composer.loggers.logger_destination import LoggerDestination
+from composer.loggers.file_logger import FileLogger
+from composer.loggers.slack_logger import SlackLogger
 from composer.loggers.wandb_logger import WandBLogger
-from composer.utils import dist
+from composer.loggers.mlflow_logger import MLFlowLogger
+from composer.loggers.neptune_logger import NeptuneLogger
+from composer.loggers.console_logger import ConsoleLogger
+from composer.loggers.cometml_logger import CometMLLogger
+from composer.loggers.mosaicml_logger import MosaicMLLogger
+from composer.loggers.in_memory_logger import InMemoryLogger
+from composer.loggers.logger_destination import LoggerDestination
+from composer.loggers.tensorboard_logger import TensorboardLogger
+from composer.loggers.progress_bar_logger import ProgressBarLogger
+from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
 
 if TYPE_CHECKING:
     from composer.core import State
@@ -39,6 +53,21 @@
 MOSAICML_LOG_DIR_ENV_VAR = 'MOSAICML_LOG_DIR'
 MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR = 'MOSAICML_GPU_LOG_FILE_PREFIX'
 
+# TODO move this logic somewhere
+LOGGER_TYPES = [FileLogger, SlackLogger,
+                WandBLogger, MLFlowLogger,
+                NeptuneLogger, ConsoleLogger,
+                CometMLLogger, MosaicMLLogger,
+                InMemoryLogger, TensorboardLogger,
+                ProgressBarLogger, RemoteUploaderDownloader,
+                LoggerDestination]
+
+def get_logger_type(logger: Any) -> str:
+    for logger_type in LOGGER_TYPES:
+        if isinstance(logger, logger_type):
+            return logger_type.__name__
+    return 'Custom'
+
 
 class MosaicMLLogger(LoggerDestination):
     """Log to the MosaicML platform.
@@ -96,8 +125,34 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
     def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
         self._log_metadata(metrics)
 
-    def log_analytics(self, autoresume: bool, trainer_state: State, loggers: List[LoggerDestination]):
-        metrics = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
+    def log_analytics(
+            self,
+            autoresume: bool,
+            trainer_state: State,
+            save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
+            loggers: List[LoggerDestination],
+    ) -> None:
+        metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
+
+        metrics['composer/optimizers'] = [
+            json.dumps(optimizer.state_dict(), sort_keys=True)
+            for optimizer in trainer_state.optimizers
+        ]
+        metrics['composer/algorithms'] = [
+            json.dumps(algorithm.state_dict(), sort_keys=True)
+            for algorithm in trainer_state.algorithms
+        ]
+        metrics['composer/loggers'] = [get_logger_type(logger) for logger in loggers]
+
+        save_interval_str:str = ''
+        if isinstance(save_interval, Union[str, int]):
+            save_interval_str = str(save_interval)
+        elif isinstance(save_interval, Time):
+            save_interval_str = f'{save_interval._value}{save_interval._unit}'
+        else:
+            save_interval_str = 'callable'
+
+        metrics['composer/save_interval'] = save_interval_str
 
         if trainer_state.fsdp_config:
             metrics['composer/sharding_strategy'] = trainer_state.fsdp_config.get('sharding_strategy', None)
@@ -105,6 +160,9 @@ def log_analytics(self, autoresume: bool, trainer_state: State, loggers: List[Lo
             metrics['composer/forward_prefetch'] = trainer_state.fsdp_config.get('forward_prefetch', False)
             metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get('backward_prefetch', None)
 
+            # Get device_mesh from config so it is in list form and JSON parsable
+            metrics['composer/device_mesh'] = trainer_state.fsdp_config.get('device_mesh', [])
+
             mixed_precision = trainer_state.fsdp_config.get('mixed_precision', None)
             if mixed_precision is not None and isinstance(mixed_precision, dict):
                 # Sorting the keys allows us to parse this dict value as JSON in a SQL query if needed
@@ -113,7 +171,10 @@ def log_analytics(self, autoresume: bool, trainer_state: State, loggers: List[Lo
                 metrics['composer/mixed_precision'] = mixed_precision
 
         if trainer_state.fsdp_state_dict_type is not None:
-            metrics['composer/state_dict_type'] = trainer_state.fsdp_state_dict_type
+            metrics['composer/state_dict_type'] = trainer_state.fsdp_state_dict_type
+
+        self.log_metrics(metrics)
+        self._flush_metadata(force_flush=True)
 
     def after_load(self, state: State, logger: Logger) -> None:
         # Log model data downloaded and initialized for run events
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 0f7ab3d09f..e19ccee7d8 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1276,7 +1276,7 @@ def __init__(
         # Logger
         self.logger = Logger(state=self.state, destinations=loggers)
         if mosaicml_logger:
-            mosaicml_logger.log_analytics(autoresume, self.state, loggers)
+            mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers, optimizers)
 
         if save_latest_filename is not None:
             remote_ud_has_format_string = [

From da1a179c4fe8ab6d6a17e170263a0bedd318eb39 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Mon, 11 Mar 2024 21:01:19 -0700
Subject: [PATCH 03/32] fix pyright tests + formatting

---
 composer/loggers/mosaicml_logger.py | 67 ++++++++++++++++------------
 composer/trainer/trainer.py         |  2 +-
 2 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index c257d125e7..a7b1a7056e 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -20,25 +20,24 @@
 import mcli
 import torch
 
-from composer.core import Time, Event
+from composer.core import Event, Time
 from composer.core.time import TimeUnit
-from composer.utils import dist
-
 # composer logger types for analytics logging
 from composer.loggers import Logger
-from composer.loggers.file_logger import FileLogger
-from composer.loggers.slack_logger import SlackLogger
-from composer.loggers.wandb_logger import WandBLogger
-from composer.loggers.mlflow_logger import MLFlowLogger
-from composer.loggers.neptune_logger import NeptuneLogger
-from composer.loggers.console_logger import ConsoleLogger
 from composer.loggers.cometml_logger import CometMLLogger
-from composer.loggers.mosaicml_logger import MosaicMLLogger
+from composer.loggers.console_logger import ConsoleLogger
+from composer.loggers.file_logger import FileLogger
 from composer.loggers.in_memory_logger import InMemoryLogger
 from composer.loggers.logger_destination import LoggerDestination
-from composer.loggers.tensorboard_logger import TensorboardLogger
+from composer.loggers.mlflow_logger import MLFlowLogger
+from composer.loggers.mosaicml_logger import MosaicMLLogger
+from composer.loggers.neptune_logger import NeptuneLogger
 from composer.loggers.progress_bar_logger import ProgressBarLogger
 from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
+from composer.loggers.slack_logger import SlackLogger
+from composer.loggers.tensorboard_logger import TensorboardLogger
+from composer.loggers.wandb_logger import WandBLogger
+from composer.utils import dist
 
 if TYPE_CHECKING:
     from composer.core import State
@@ -54,13 +53,22 @@
 MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR = 'MOSAICML_GPU_LOG_FILE_PREFIX'
 
 # TODO move this logic somewhere
-LOGGER_TYPES = [FileLogger, SlackLogger,
-                WandBLogger, MLFlowLogger,
-                NeptuneLogger, ConsoleLogger,
-                CometMLLogger, MosaicMLLogger,
-                InMemoryLogger, TensorboardLogger,
-                ProgressBarLogger, RemoteUploaderDownloader,
-                LoggerDestination]
+LOGGER_TYPES = [
+    FileLogger,
+    SlackLogger,
+    WandBLogger,
+    MLFlowLogger,
+    NeptuneLogger,
+    ConsoleLogger,
+    CometMLLogger,
+    MosaicMLLogger,
+    InMemoryLogger,
+    TensorboardLogger,
+    ProgressBarLogger,
+    RemoteUploaderDownloader,
+    LoggerDestination,
+]
+
 
 def get_logger_type(logger: Any) -> str:
     for logger_type in LOGGER_TYPES:
@@ -126,25 +134,23 @@ def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> No
         self._log_metadata(metrics)
 
     def log_analytics(
-            self,
-            autoresume: bool,
-            trainer_state: State,
+        self,
+        autoresume: bool,
+        trainer_state: State,
         save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
         loggers: List[LoggerDestination],
     ) -> None:
         metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
 
         metrics['composer/optimizers'] = [
-            json.dumps(optimizer.state_dict(), sort_keys=True)
-            for optimizer in trainer_state.optimizers
+            json.dumps(optimizer.state_dict(), sort_keys=True) for optimizer in trainer_state.optimizers
         ]
         metrics['composer/algorithms'] = [
-            json.dumps(algorithm.state_dict(), sort_keys=True)
-            for algorithm in trainer_state.algorithms
+            json.dumps(algorithm.state_dict(), sort_keys=True) for algorithm in trainer_state.algorithms
         ]
         metrics['composer/loggers'] = [get_logger_type(logger) for logger in loggers]
 
-        save_interval_str:str = ''
+        save_interval_str: str = ''
         if isinstance(save_interval, Union[str, int]):
             save_interval_str = str(save_interval)
         elif isinstance(save_interval, Time):
             save_interval_str = f'{save_interval._value}{save_interval._unit}'
@@ -154,9 +160,12 @@ def log_analytics(
         metrics['composer/save_interval'] = save_interval_str
 
-        if trainer_state.fsdp_config: 
+        if trainer_state.fsdp_config:
             metrics['composer/sharding_strategy'] = trainer_state.fsdp_config.get('sharding_strategy', None)
-            metrics['composer/activation_checkpointing'] = trainer_state.fsdp_config.get('activation_checkpointing', False)
+            metrics['composer/activation_checkpointing'] = trainer_state.fsdp_config.get(
+                'activation_checkpointing',
+                False,
+            )
             metrics['composer/forward_prefetch'] = trainer_state.fsdp_config.get('forward_prefetch', False)
             metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get('backward_prefetch', None)
 
@@ -171,7 +180,7 @@ def log_analytics(
                 metrics['composer/mixed_precision'] = mixed_precision
 
         if trainer_state.fsdp_state_dict_type is not None:
-            metrics['composer/state_dict_type'] = trainer_state.fsdp_state_dict_type 
+            metrics['composer/state_dict_type'] = trainer_state.fsdp_state_dict_type
 
         self.log_metrics(metrics)
         self._flush_metadata(force_flush=True)
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index e19ccee7d8..a0cc9655ad 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1276,7 +1276,7 @@ def __init__(
         # Logger
         self.logger = Logger(state=self.state, destinations=loggers)
         if mosaicml_logger:
-            mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers, optimizers)
+            mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers)
 
         if save_latest_filename is not None:
             remote_ud_has_format_string = [
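The `save_interval` handling above collapses several accepted types into one string for analytics. A self-contained sketch of the same dispatch, using a stand-in `Time` dataclass rather than `composer.core.time.Time`; note the tuple form `(str, int)` here, since `isinstance(..., Union[str, int])` as the diffs spell it appears to be accepted only on newer Pythons (3.10+), while the tuple form behaves identically everywhere.

from dataclasses import dataclass
from typing import Callable, Union

@dataclass
class Time:  # stand-in for composer.core.time.Time
    _value: int
    _unit: str

def save_interval_to_str(save_interval: Union[str, int, Time, Callable]) -> str:
    if isinstance(save_interval, (str, int)):
        return str(save_interval)
    if isinstance(save_interval, Time):
        return f'{save_interval._value}{save_interval._unit}'
    return 'callable'

assert save_interval_to_str('1ep') == '1ep'
assert save_interval_to_str(500) == '500'
assert save_interval_to_str(Time(1000, 'ba')) == '1000ba'
assert save_interval_to_str(lambda state, event: True) == 'callable'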
From 44cb2838b51f020c5be87fce9cc97082c7a07f01 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 15:26:44 -0700
Subject: [PATCH 04/32] log cloud providers from `load_path` / `save_folder`

---
 composer/loggers/mosaicml_logger.py | 16 ++++++++++++++--
 composer/trainer/trainer.py         |  5 +++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index a7b1a7056e..7cbc5f68ff 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -38,6 +38,7 @@
 from composer.loggers.tensorboard_logger import TensorboardLogger
 from composer.loggers.wandb_logger import WandBLogger
 from composer.utils import dist
+from composer.utils.file_helpers import parse_uri
 
 if TYPE_CHECKING:
     from composer.core import State
@@ -139,6 +140,8 @@ def log_analytics(
         trainer_state: State,
         save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
         loggers: List[LoggerDestination],
+        load_path: Optional[str] = None,
+        save_folder: Optional[str] = None
     ) -> None:
         metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
 
@@ -150,14 +153,23 @@ def log_analytics(
         metrics['composer/loggers'] = [get_logger_type(logger) for logger in loggers]
 
-        save_interval_str: str = ''
+        # Take the service provider out of the URI and log it to metadata. If no service provider
+        # is found (i.e. backend = ''), then we assume 'local' for the cloud provider.
+        if load_path is not None:
+            backend, _, _ = parse_uri(load_path)
+            metrics['composer/cloud_provider_data'] = backend if backend else 'local'
+        if save_folder is not None:
+            backend, _, _ = parse_uri(save_folder)
+            metrics['composer/cloud_provider_checkpoints'] = backend if backend else 'local'
+
+        # Save interval can be passed in w/ multiple types. If the type is a function, then
+        # we log 'callable' as the save_interval value for analytics.
         if isinstance(save_interval, Union[str, int]):
             save_interval_str = str(save_interval)
         elif isinstance(save_interval, Time):
             save_interval_str = f'{save_interval._value}{save_interval._unit}'
         else:
             save_interval_str = 'callable'
-
         metrics['composer/save_interval'] = save_interval_str
 
         if trainer_state.fsdp_config:
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index c7b9d01d3a..a1bcd245e8 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1300,8 +1300,6 @@ def __init__(
 
         # Logger
         self.logger = Logger(state=self.state, destinations=loggers)
-        if mosaicml_logger:
-            mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers)
 
         if save_latest_filename is not None:
             remote_ud_has_format_string = [
@@ -1702,6 +1700,9 @@ def __init__(
             )
             self.state.run_name = run_name
 
+        if mosaicml_logger is not None:
+            mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers, load_path, save_folder)
+
         # FSDP wrap if model is not yet wrapped and FSDP is enabled. This can happen if
         # load_fsdp_monolith_rank0_only=True but no checkpoint was loaded.
         if not self.state.fsdp_enabled and self.state.fsdp_config is not None and self.state.fsdp_auto_wrap and self.state.load_fsdp_monolith_rank0_only:

From 681e1668db8dcf445159198c7da1e8d5ea26a08b Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 15:33:52 -0700
Subject: [PATCH 05/32] run formatter

---
 composer/loggers/mosaicml_logger.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 7cbc5f68ff..a59188226e 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -141,7 +141,7 @@ def log_analytics(
         save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
         loggers: List[LoggerDestination],
         load_path: Optional[str] = None,
-        save_folder: Optional[str] = None
+        save_folder: Optional[str] = None,
     ) -> None:
         metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
 
@@ -162,7 +162,7 @@ def log_analytics(
             backend, _, _ = parse_uri(save_folder)
             metrics['composer/cloud_provider_checkpoints'] = backend if backend else 'local'
 
-        # Save interval can be passed in w/ multiple types. If the type is a function, then 
+        # Save interval can be passed in w/ multiple types. If the type is a function, then
         # we log 'callable' as the save_interval value for analytics.
         if isinstance(save_interval, Union[str, int]):
             save_interval_str = str(save_interval)

From 362f9ba86312480030bce2ec6918bdddac5202f9 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 15:49:50 -0700
Subject: [PATCH 06/32] get rid of circular imports

---
 composer/loggers/mosaicml_logger.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index a59188226e..45c7f3fd05 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -20,8 +20,8 @@
 import mcli
 import torch
 
-from composer.core import Event, Time
-from composer.core.time import TimeUnit
+from composer.core.event import Event
+from composer.core.time import Time, TimeUnit
 # composer logger types for analytics logging
 from composer.loggers import Logger
 from composer.loggers.cometml_logger import CometMLLogger
@@ -30,7 +30,6 @@
 from composer.loggers.in_memory_logger import InMemoryLogger
 from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.mlflow_logger import MLFlowLogger
-from composer.loggers.mosaicml_logger import MosaicMLLogger
 from composer.loggers.neptune_logger import NeptuneLogger
 from composer.loggers.progress_bar_logger import ProgressBarLogger
 from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
@@ -62,7 +61,6 @@
     NeptuneLogger,
     ConsoleLogger,
     CometMLLogger,
-    MosaicMLLogger,
    InMemoryLogger,
     TensorboardLogger,
     ProgressBarLogger,
@@ -149,7 +147,7 @@ def log_analytics(
         metrics['composer/algorithms'] = [
             json.dumps(algorithm.state_dict(), sort_keys=True) for algorithm in trainer_state.algorithms
         ]
-        metrics['composer/loggers'] = [get_logger_type(logger) for logger in loggers]
+        metrics['composer/loggers'] = [get_logger_type(logger) for logger in (loggers + [MosaicMLLogger])]
 
         # Take the service provider out of the URI and log it to metadata. If no service provider
         # is found (i.e. backend = ''), then we assume 'local' for the cloud provider.
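On the cloud-provider fields introduced in PATCH 04: `composer.utils.file_helpers.parse_uri` splits a URI into `(backend, bucket, path)`, with an empty backend for plain local paths, which is exactly what the `'local'` fallback keys off. A short usage sketch; the example URIs are hypothetical.

from composer.utils.file_helpers import parse_uri

backend, bucket, path = parse_uri('s3://my-bucket/checkpoints/ep1.pt')
print(backend, bucket, path)  # s3 my-bucket checkpoints/ep1.pt

backend, _, _ = parse_uri('/tmp/checkpoints/ep1.pt')
print(backend if backend else 'local')  # 'local' -- no URI scheme means a local path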
From 21cce59d434152153d38652a94defc621cd88c8d Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 16:28:05 -0700
Subject: [PATCH 07/32] access mosaicml_logger in a different way that doesn't affect tests

---
 composer/loggers/mosaicml_logger.py | 37 ++-------------------------
 composer/trainer/trainer.py         |  3 +--
 composer/utils/__init__.py          |  2 ++
 composer/utils/analytics_helpers.py | 39 +++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 37 deletions(-)
 create mode 100644 composer/utils/analytics_helpers.py

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 45c7f3fd05..b554175c09 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -24,19 +24,9 @@
 from composer.core.event import Event
 from composer.core.time import Time, TimeUnit
 # composer logger types for analytics logging
 from composer.loggers import Logger
-from composer.loggers.cometml_logger import CometMLLogger
-from composer.loggers.console_logger import ConsoleLogger
-from composer.loggers.file_logger import FileLogger
-from composer.loggers.in_memory_logger import InMemoryLogger
 from composer.loggers.logger_destination import LoggerDestination
-from composer.loggers.mlflow_logger import MLFlowLogger
-from composer.loggers.neptune_logger import NeptuneLogger
-from composer.loggers.progress_bar_logger import ProgressBarLogger
-from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
-from composer.loggers.slack_logger import SlackLogger
-from composer.loggers.tensorboard_logger import TensorboardLogger
 from composer.loggers.wandb_logger import WandBLogger
-from composer.utils import dist
+from composer.utils import dist, get_logger_type
 from composer.utils.file_helpers import parse_uri
 
 if TYPE_CHECKING:
@@ -52,29 +42,6 @@
 MOSAICML_LOG_DIR_ENV_VAR = 'MOSAICML_LOG_DIR'
 MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR = 'MOSAICML_GPU_LOG_FILE_PREFIX'
 
-# TODO move this logic somewhere
-LOGGER_TYPES = [
-    FileLogger,
-    SlackLogger,
-    WandBLogger,
-    MLFlowLogger,
-    NeptuneLogger,
-    ConsoleLogger,
-    CometMLLogger,
-    MosaicMLLogger,
-    InMemoryLogger,
-    TensorboardLogger,
-    ProgressBarLogger,
-    RemoteUploaderDownloader,
-    LoggerDestination,
-]
-
-
-def get_logger_type(logger: Any) -> str:
-    for logger_type in LOGGER_TYPES:
-        if isinstance(logger, logger_type):
-            return logger_type.__name__
-    return 'Custom'
-
 
 class MosaicMLLogger(LoggerDestination):
     """Log to the MosaicML platform.
@@ -149,7 +116,7 @@ def log_analytics(
         metrics['composer/algorithms'] = [
             json.dumps(algorithm.state_dict(), sort_keys=True) for algorithm in trainer_state.algorithms
         ]
-        metrics['composer/loggers'] = [get_logger_type(logger) for logger in (loggers + [MosaicMLLogger])]
+        metrics['composer/loggers'] = [get_logger_type(logger) for logger in loggers]
 
         # Take the service provider out of the URI and log it to metadata. If no service provider
         # is found (i.e. backend = ''), then we assume 'local' for the cloud provider.
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index a1bcd245e8..c69731363b 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1280,7 +1280,6 @@ def __init__(
 
         # MosaicML Logger
         # Keep MosaicML logger above the RemoteUploaderDownloader so that fit end is reported before the final checkpoint begins uploading
-        mosaicml_logger = None
         if os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true' and os.environ.get(
             MOSAICML_ACCESS_TOKEN_ENV_VAR,
         ) is not None and not any(isinstance(x, MosaicMLLogger) for x in loggers):
@@ -1700,7 +1699,7 @@ def __init__(
             )
             self.state.run_name = run_name
 
-        if mosaicml_logger is not None:
+        if next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None) is not None:
             mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers, load_path, save_folder)
 
         # FSDP wrap if model is not yet wrapped and FSDP is enabled. This can happen if
diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py
index bdd5a22447..2912ff4b7c 100644
--- a/composer/utils/__init__.py
+++ b/composer/utils/__init__.py
@@ -3,6 +3,7 @@
 
 """Helper utilities."""
 
+from composer.utils.analytics_helpers import get_logger_type
 from composer.utils.auto_log_hparams import (
     convert_flat_dict_to_nested_dict,
     convert_nested_dict_to_flat_dict,
@@ -129,4 +130,5 @@
     'LocalEvalClient',
     'MosaicMLLambdaEvalClient',
     'partial_format',
+    'get_logger_type',
 ]
diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py
new file mode 100644
index 0000000000..f56be20d80
--- /dev/null
+++ b/composer/utils/analytics_helpers.py
@@ -0,0 +1,39 @@
+# TODO move this logic somewhere
+from typing import Any
+from composer.loggers.file_logger import FileLogger
+from composer.loggers.slack_logger import SlackLogger
+from composer.loggers.wandb_logger import WandBLogger
+from composer.loggers.mlflow_logger import MLFlowLogger
+from composer.loggers.neptune_logger import NeptuneLogger
+from composer.loggers.console_logger import ConsoleLogger
+from composer.loggers.cometml_logger import CometMLLogger
+from composer.loggers.mosaicml_logger import MosaicMLLogger
+from composer.loggers.in_memory_logger import InMemoryLogger
+from composer.loggers.tensorboard_logger import TensorboardLogger
+from composer.loggers.progress_bar_logger import ProgressBarLogger
+from composer.loggers.logger_destination import LoggerDestination
+from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
+
+
+LOGGER_TYPES = [
+    FileLogger,
+    SlackLogger,
+    WandBLogger,
+    MLFlowLogger,
+    NeptuneLogger,
+    ConsoleLogger,
+    CometMLLogger,
+    MosaicMLLogger,
+    InMemoryLogger,
+    TensorboardLogger,
+    ProgressBarLogger,
+    RemoteUploaderDownloader,
+    LoggerDestination,
+]
+
+def get_logger_type(logger: Any) -> str:
+    for logger_type in LOGGER_TYPES:
+        if isinstance(logger, logger_type):
+            return logger_type.__name__
+    return 'Custom'
+
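One property of the moved helper worth illustrating: `LOGGER_TYPES` is checked in order, and `LoggerDestination` sits last, so subclasses report their specific type while unrecognized `LoggerDestination` subclasses fall through to the base name. A compressed, self-contained sketch of the same dispatch (the classes here are stand-ins, and `MyLogger` is hypothetical):

class LoggerDestination: ...            # stand-ins for the composer classes
class WandBLogger(LoggerDestination): ...
class MyLogger(LoggerDestination): ...  # hypothetical user-defined logger

LOGGER_TYPES = [WandBLogger, LoggerDestination]  # most specific first, base class last

def get_logger_type(logger) -> str:
    for logger_type in LOGGER_TYPES:
        if isinstance(logger, logger_type):
            return logger_type.__name__
    return 'Custom'

assert get_logger_type(WandBLogger()) == 'WandBLogger'
assert get_logger_type(MyLogger()) == 'LoggerDestination'  # custom loggers match the base
assert get_logger_type(object()) == 'Custom'               # non-loggers fall through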
From be3000437a8ae79053b3995d48793ac367445283 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 17:30:28 -0700
Subject: [PATCH 08/32] smol improvements to style

---
 composer/trainer/trainer.py         |  1 +
 composer/utils/analytics_helpers.py | 30 ++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index c69731363b..8a66782dab 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1700,6 +1700,7 @@ def __init__(
             self.state.run_name = run_name
 
         if next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None) is not None:
+            mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)))
             mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers, load_path, save_folder)
 
         # FSDP wrap if model is not yet wrapped and FSDP is enabled. This can happen if
diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py
index f56be20d80..be8ab93bf4 100644
--- a/composer/utils/analytics_helpers.py
+++ b/composer/utils/analytics_helpers.py
@@ -1,19 +1,23 @@
-# TODO move this logic somewhere
+# Copyright 2024 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Helpers for logging analytics with the MosaicMLLogger."""
+
 from typing import Any
+
+from composer.loggers.cometml_logger import CometMLLogger
+from composer.loggers.console_logger import ConsoleLogger
 from composer.loggers.file_logger import FileLogger
-from composer.loggers.slack_logger import SlackLogger
-from composer.loggers.wandb_logger import WandBLogger
+from composer.loggers.in_memory_logger import InMemoryLogger
+from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.mlflow_logger import MLFlowLogger
-from composer.loggers.neptune_logger import NeptuneLogger
-from composer.loggers.console_logger import ConsoleLogger
-from composer.loggers.cometml_logger import CometMLLogger
 from composer.loggers.mosaicml_logger import MosaicMLLogger
-from composer.loggers.in_memory_logger import InMemoryLogger
-from composer.loggers.tensorboard_logger import TensorboardLogger
+from composer.loggers.neptune_logger import NeptuneLogger
 from composer.loggers.progress_bar_logger import ProgressBarLogger
-from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
-
+from composer.loggers.slack_logger import SlackLogger
+from composer.loggers.tensorboard_logger import TensorboardLogger
+from composer.loggers.wandb_logger import WandBLogger
 
 LOGGER_TYPES = [
     FileLogger,
@@ -31,9 +35,13 @@
     LoggerDestination,
 ]
 
+
 def get_logger_type(logger: Any) -> str:
+    """Returns the type of a logger as a string. If the logger is not a known type, returns 'Custom'.
+
+    TODO: could any logger even be labeled as 'Custom'? wouldn't a custom logger just be an instance of LoggerDestination?
+    """
     for logger_type in LOGGER_TYPES:
         if isinstance(logger, logger_type):
            return logger_type.__name__
     return 'Custom'
-

From 77d2c25a62bb4f1f4c3354c50098cf3e47805a7f Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 17:43:08 -0700
Subject: [PATCH 09/32] oops get rid of more circular imports 0_0

---
 composer/loggers/mosaicml_logger.py | 8 ++++++--
 composer/utils/__init__.py          | 2 --
 composer/utils/analytics_helpers.py | 2 --
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index b554175c09..5d58a5ffb3 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -26,7 +26,8 @@
 from composer.loggers import Logger
 from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.wandb_logger import WandBLogger
-from composer.utils import dist, get_logger_type
+from composer.utils import dist
+from composer.utils.analytics_helpers import get_logger_type
 from composer.utils.file_helpers import parse_uri
 
 if TYPE_CHECKING:
@@ -116,7 +117,10 @@ def log_analytics(
         metrics['composer/algorithms'] = [
             json.dumps(algorithm.state_dict(), sort_keys=True) for algorithm in trainer_state.algorithms
         ]
-        metrics['composer/loggers'] = [get_logger_type(logger) for logger in loggers]
+        metrics['composer/loggers'] = [
+            get_logger_type(logger) if not isinstance(logger, MosaicMLLogger) else 'MosaicMLLogger'
+            for logger in loggers
+        ]
 
         # Take the service provider out of the URI and log it to metadata. If no service provider
         # is found (i.e. backend = ''), then we assume 'local' for the cloud provider.
diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py
index 2912ff4b7c..bdd5a22447 100644
--- a/composer/utils/__init__.py
+++ b/composer/utils/__init__.py
@@ -3,7 +3,6 @@
 
 """Helper utilities."""
 
-from composer.utils.analytics_helpers import get_logger_type
 from composer.utils.auto_log_hparams import (
     convert_flat_dict_to_nested_dict,
     convert_nested_dict_to_flat_dict,
@@ -130,5 +129,4 @@
     'LocalEvalClient',
     'MosaicMLLambdaEvalClient',
     'partial_format',
-    'get_logger_type',
 ]
diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py
index be8ab93bf4..d273383768 100644
--- a/composer/utils/analytics_helpers.py
+++ b/composer/utils/analytics_helpers.py
@@ -11,7 +11,6 @@
 from composer.loggers.in_memory_logger import InMemoryLogger
 from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.mlflow_logger import MLFlowLogger
-from composer.loggers.mosaicml_logger import MosaicMLLogger
 from composer.loggers.neptune_logger import NeptuneLogger
 from composer.loggers.progress_bar_logger import ProgressBarLogger
 from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
@@ -27,7 +26,6 @@
     NeptuneLogger,
     ConsoleLogger,
     CometMLLogger,
-    MosaicMLLogger,
     InMemoryLogger,
     TensorboardLogger,
     ProgressBarLogger,

From 82d771aa1b9c85bd3b1296cfe5ccfb9ceb1650fe Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 12 Mar 2024 19:56:36 -0700
Subject: [PATCH 10/32] log `train_loader_workers` and `eval_loaders` to analytics

---
 composer/loggers/mosaicml_logger.py | 15 +++++++++++++++
 composer/trainer/trainer.py         | 10 +++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 5d58a5ffb3..226fbea106 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -20,6 +20,7 @@
 import mcli
 import torch
 
+from composer.core import DataSpec
 from composer.core.event import Event
 from composer.core.time import Time, TimeUnit
 # composer logger types for analytics logging
@@ -104,11 +105,25 @@ def log_analytics(
         trainer_state: State,
         save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
         loggers: List[LoggerDestination],
+        train_dataloader: Union[DataSpec, None],
         load_path: Optional[str] = None,
         save_folder: Optional[str] = None,
     ) -> None:
         metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
 
+        if train_dataloader is not None and isinstance(train_dataloader.dataloader, torch.utils.data.DataLoader):
+            metrics['composer/train_loader_workers'] = train_dataloader.dataloader.num_workers
+
+        metrics['composer/eval_loaders'] = []
+        for evaluator in trainer_state.evaluators:
+            if isinstance(evaluator.dataloader, torch.utils.data.DataLoader):
+                metrics['composer/eval_loaders'].append(
+                    json.dumps({
+                        'label': evaluator.label,
+                        'num_workers': evaluator.dataloader.num_workers,
+                    }),
+                )
+
         metrics['composer/optimizers'] = [
             json.dumps(optimizer.state_dict(), sort_keys=True) for optimizer in trainer_state.optimizers
         ]
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 8a66782dab..aa9b696597 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1701,7 +1701,15 @@ def __init__(
 
         if next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None) is not None:
             mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)))
-            mosaicml_logger.log_analytics(autoresume, self.state, save_interval, loggers, load_path, save_folder)
+            mosaicml_logger.log_analytics(
+                autoresume,
+                self.state,
+                save_interval,
+                loggers,
+                train_dataloader,
+                load_path,
+                save_folder,
+            )
 
         # FSDP wrap if model is not yet wrapped and FSDP is enabled. This can happen if
         # load_fsdp_monolith_rank0_only=True but no checkpoint was loaded.
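The eval-loader metric above serializes one JSON string per evaluator. Roughly, with hypothetical evaluator labels, the logged value looks like the sketch below; note that real `Evaluator` objects wrap their dataloaders in a `DataSpec`, which is the access bug PATCH 11 just after this corrects.

import json

# Hypothetical evaluators: (label, num_workers) pairs read off torch DataLoaders.
evaluators = [('eval/mmlu', 8), ('eval/gsm8k', 4)]

eval_loaders = [
    json.dumps({'label': label, 'num_workers': num_workers})
    for label, num_workers in evaluators
]
print(eval_loaders)
# ['{"label": "eval/mmlu", "num_workers": 8}', '{"label": "eval/gsm8k", "num_workers": 4}']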
From b9aa219f8de51bb8028acf04f1005c5f29e78db9 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Wed, 13 Mar 2024 14:37:48 -0700
Subject: [PATCH 11/32] fix type checks / access for `torch.utils.data.DataLoader`

---
 composer/loggers/mosaicml_logger.py | 13 +++++++------
 composer/trainer/trainer.py         |  1 -
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 226fbea106..bce7702c55 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -19,8 +19,8 @@
 
 import mcli
 import torch
+import torch.utils.data
 
-from composer.core import DataSpec
 from composer.core.event import Event
 from composer.core.time import Time, TimeUnit
 # composer logger types for analytics logging
@@ -105,22 +105,23 @@ def log_analytics(
         trainer_state: State,
         save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
         loggers: List[LoggerDestination],
-        train_dataloader: Union[DataSpec, None],
         load_path: Optional[str] = None,
         save_folder: Optional[str] = None,
     ) -> None:
         metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
 
-        if train_dataloader is not None and isinstance(train_dataloader.dataloader, torch.utils.data.DataLoader):
-            metrics['composer/train_loader_workers'] = train_dataloader.dataloader.num_workers
+        train_dataloader = trainer_state.train_dataloader
+        if train_dataloader is not None and isinstance(train_dataloader, torch.utils.data.DataLoader):
+            metrics['composer/train_loader_workers'] = train_dataloader.num_workers
 
         metrics['composer/eval_loaders'] = []
         for evaluator in trainer_state.evaluators:
-            if isinstance(evaluator.dataloader, torch.utils.data.DataLoader):
+            dataloader = evaluator.dataloader.dataloader
+            if isinstance(dataloader, torch.utils.data.DataLoader):
                 metrics['composer/eval_loaders'].append(
                     json.dumps({
                         'label': evaluator.label,
-                        'num_workers': evaluator.dataloader.num_workers,
+                        'num_workers': dataloader.num_workers,
                     }),
                 )
 
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index aa9b696597..25ec03c4d4 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1706,7 +1706,6 @@ def __init__(
                 self.state,
                 save_interval,
                 loggers,
-                train_dataloader,
                 load_path,
                 save_folder,
             )

From d841895052985510528ae5fe9bb5f92339d4bd33 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Wed, 13 Mar 2024 17:17:09 -0700
Subject: [PATCH 12/32] remove unnecessary comment

---
 composer/loggers/mosaicml_logger.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index bce7702c55..fd3c0798f9 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -23,7 +23,6 @@
 
 from composer.core.event import Event
 from composer.core.time import Time, TimeUnit
-# composer logger types for analytics logging
 from composer.loggers import Logger
 from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.wandb_logger import WandBLogger

From 4a9da314c3c06c804e077f538e38fbc6379fe133 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Fri, 15 Mar 2024 11:35:16 -0700
Subject: [PATCH 13/32] log analytics on `EVENT.INIT`

---
 composer/core/engine.py             |  5 +++++
 composer/loggers/mosaicml_logger.py | 32 ++++++++++++++++++---------
 composer/trainer/trainer.py         | 22 +++++++++-----------
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/composer/core/engine.py b/composer/core/engine.py
index 17e5365e2a..58d79d125e 100644
--- a/composer/core/engine.py
+++ b/composer/core/engine.py
@@ -83,6 +83,7 @@ def run_last(algorithms: Sequence[Algorithm], event: Event) -> Sequence[Algorith
 from composer.core.event import Event
 from composer.core.state import State
 from composer.loggers import Logger, LoggerDestination
+from composer.loggers.mosaicml_logger import log_run_analytics
 from composer.profiler import ProfilerAction
 from composer.utils import ensure_tuple
 
@@ -293,6 +294,10 @@ def run_event(
             self._run_loggers(event)
             self._run_nonlogger_callbacks(event)
             traces = self._run_algorithms(event)
+
+            # If a MosaicMLLogger is present, log analytics for the run to metadata.
+            log_run_analytics(self.logger.destinations)
+
         else:
             traces = self._run_algorithms(event)
             # Run callbacks first, so any log calls from a callback that are executed lazily
diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index d066d86e7a..73cf5b2a15 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -15,7 +15,7 @@
 import warnings
 from concurrent.futures import wait
 from functools import reduce
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import mcli
 import torch
@@ -73,10 +73,12 @@ def __init__(
         log_interval: int = 60,
         ignore_keys: Optional[List[str]] = None,
         ignore_exceptions: bool = False,
+        analytics_data: Optional[Dict[str, Any]] = None,
     ) -> None:
         self.log_interval = log_interval
         self.ignore_keys = ignore_keys
         self.ignore_exceptions = ignore_exceptions
+        self.analytics_data = analytics_data
         self._enabled = dist.get_global_rank() == 0
         if self._enabled:
             self.time_last_logged = 0
@@ -100,15 +102,18 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
     def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
         self._log_metadata(metrics)
 
-    def log_analytics(
-        self,
-        autoresume: bool,
-        trainer_state: State,
-        save_interval: Union[str, int, Time, Callable[[State, Event], bool]],
-        loggers: List[LoggerDestination],
-        load_path: Optional[str] = None,
-        save_folder: Optional[str] = None,
-    ) -> None:
+    def log_analytics(self,) -> None:
+        if self.analytics_data is None:
+            return
+
+        # Fetch / cast metrics that we want to log from self.analytics_data
+        autoresume: bool = self.analytics_data['autoresume']
+        trainer_state: State = self.analytics_data['state']
+        save_interval: Union[str, int, Time, Callable[[State, Event], bool]] = self.analytics_data['save_interval']
+        loggers: List[LoggerDestination] = self.analytics_data['loggers']
+        load_path: Union[str, None] = self.analytics_data['load_path']
+        save_folder: Union[str, None] = self.analytics_data['save_folder']
+
         metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision}
 
         train_dataloader = trainer_state.train_dataloader
@@ -369,3 +374,10 @@ def exception_to_json_serializable_dict(exc: Exception):
         except AttributeError:
             pass
     return exc_data
+
+
+def log_run_analytics(loggers: Tuple[LoggerDestination, ...]):
+    """Log run analytics to MosaicML, if a MosaicMLLogger is available in the list."""
+    mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None)
+    if mosaicml_logger is not None:
+        mosaicml_logger.log_analytics()
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 25ec03c4d4..cee5fce72d 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1284,7 +1284,16 @@ def __init__(
             MOSAICML_ACCESS_TOKEN_ENV_VAR,
         ) is not None and not any(isinstance(x, MosaicMLLogger) for x in loggers):
             log.info('Detected run on MosaicML platform. Adding MosaicMLLogger to loggers.')
-            mosaicml_logger = MosaicMLLogger()
+            mosaicml_logger = MosaicMLLogger(
+                analytics_data={
+                    'autoresume': autoresume,
+                    'state': self.state,
+                    'save_interval': save_interval,
+                    'loggers': loggers,
+                    'load_path': load_path,
+                    'save_folder': save_folder,
+                },
+            )
             loggers.append(mosaicml_logger)
 
         # Remote Uploader Downloader
@@ -1699,17 +1708,6 @@ def __init__(
             )
             self.state.run_name = run_name
 
-        if next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None) is not None:
-            mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)))
-            mosaicml_logger.log_analytics(
-                autoresume,
-                self.state,
-                save_interval,
-                loggers,
-                load_path,
-                save_folder,
-            )
-
         # FSDP wrap if model is not yet wrapped and FSDP is enabled. This can happen if
         # load_fsdp_monolith_rank0_only=True but no checkpoint was loaded.
         if not self.state.fsdp_enabled and self.state.fsdp_config is not None and self.state.fsdp_auto_wrap and self.state.load_fsdp_monolith_rank0_only:

From 8a5d9dff3f30116df464743e2367a50e97d62fef Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Fri, 15 Mar 2024 11:39:42 -0700
Subject: [PATCH 14/32] comment adjustment

---
 composer/loggers/mosaicml_logger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 73cf5b2a15..c34f753290 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -377,7 +377,7 @@ def exception_to_json_serializable_dict(exc: Exception):
 
 def log_run_analytics(loggers: Tuple[LoggerDestination, ...]):
-    """Log run analytics to MosaicML, if a MosaicMLLogger is available in the list."""
+    """Log run analytics to metadata if a MosaicMLLogger is available in the list."""
     mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None)
     if mosaicml_logger is not None:
         mosaicml_logger.log_analytics()
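The flow PATCH 13 sets up is worth spelling out: the Trainer stashes `analytics_data` on the logger at construction time, and the engine later triggers `log_analytics` when it processes `Event.INIT`. A minimal sketch of that deferred-logging pattern outside composer (class and method names here are hypothetical):

from typing import Any, Dict, Optional

class DeferredAnalyticsLogger:
    """Stores analytics at construction; emits them when init() fires."""

    def __init__(self, analytics_data: Optional[Dict[str, Any]] = None) -> None:
        self.analytics_data = analytics_data

    def init(self) -> None:  # composer invokes hooks like this on Event.INIT
        if self.analytics_data is None:
            return  # nothing staged, e.g. the logger was constructed by hand
        print({f'composer/{k}': v for k, v in self.analytics_data.items()})

logger = DeferredAnalyticsLogger({'autoresume': True, 'save_interval': '1ep'})
logger.init()  # -> {'composer/autoresume': True, 'composer/save_interval': '1ep'}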
From b8032c5362ac0448b2a617cd6be6f074a4dd0d58 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Fri, 15 Mar 2024 12:09:20 -0700
Subject: [PATCH 15/32] make sure `Logger.destinations` is iterable

---
 composer/loggers/mosaicml_logger.py | 6 +++++-
 composer/utils/analytics_helpers.py | 5 +----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index c34f753290..7dc93f21d3 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -15,7 +15,7 @@
 import warnings
 from concurrent.futures import wait
 from functools import reduce
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import mcli
 import torch
@@ -378,6 +378,10 @@ def exception_to_json_serializable_dict(exc: Exception):
 
 def log_run_analytics(loggers: Tuple[LoggerDestination, ...]):
     """Log run analytics to metadata if a MosaicMLLogger is available in the list."""
+    # Avoids a casting bug during testing
+    if not isinstance(loggers, Iterable):
+        return
+
     mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None)
     if mosaicml_logger is not None:
         mosaicml_logger.log_analytics()
diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py
index d273383768..b38a2a9497 100644
--- a/composer/utils/analytics_helpers.py
+++ b/composer/utils/analytics_helpers.py
@@ -35,10 +35,7 @@
 
 def get_logger_type(logger: Any) -> str:
-    """Returns the type of a logger as a string. If the logger is not a known type, returns 'Custom'.
-
-    TODO: could any logger even be labeled as 'Custom'? wouldn't a custom logger just be an instance of LoggerDestination?
-    """
+    """Returns the type of a logger as a string. If the logger is not a known type, returns 'Custom'."""
     for logger_type in LOGGER_TYPES:
         if isinstance(logger, logger_type):
             return logger_type.__name__

From 52db068fb52a8f3a01196104f478c574c559abd7 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 19 Mar 2024 16:55:53 -0700
Subject: [PATCH 16/32] update default for `backward_prefetch` and move analytics logging to `def init`

---
 composer/core/engine.py             |  4 ----
 composer/loggers/mosaicml_logger.py | 24 ++++++++++--------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/composer/core/engine.py b/composer/core/engine.py
index 58d79d125e..e0e640d9a3 100644
--- a/composer/core/engine.py
+++ b/composer/core/engine.py
@@ -83,7 +83,6 @@ def run_last(algorithms: Sequence[Algorithm], event: Event) -> Sequence[Algorith
 from composer.core.event import Event
 from composer.core.state import State
 from composer.loggers import Logger, LoggerDestination
-from composer.loggers.mosaicml_logger import log_run_analytics
 from composer.profiler import ProfilerAction
 from composer.utils import ensure_tuple
 
@@ -295,9 +294,6 @@ def run_event(
             self._run_nonlogger_callbacks(event)
             traces = self._run_algorithms(event)
 
-            # If a MosaicMLLogger is present, log analytics for the run to metadata.
-            log_run_analytics(self.logger.destinations)
-
         else:
             traces = self._run_algorithms(event)
             # Run callbacks first, so any log calls from a callback that are executed lazily
diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 7dc93f21d3..9b4e1355fd 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -15,11 +15,13 @@
 import warnings
 from concurrent.futures import wait
 from functools import reduce
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import mcli
 import torch
 import torch.utils.data
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.nn import Module
 
 from composer.core.event import Event
 from composer.core.time import Time, TimeUnit
@@ -66,6 +68,7 @@ class MosaicMLLogger(LoggerDestination):
             (default: ``None``)
         ignore_exceptions: Flag to disable logging exceptions.
             Defaults to False.
+        analytics_data (Dict[str, Any], optional): Analytical metrics to log about the current run. Defaults to ``None``.
     """
 
     def __init__(
@@ -168,7 +171,10 @@ def log_analytics(self,) -> None:
                 False,
             )
             metrics['composer/forward_prefetch'] = trainer_state.fsdp_config.get('forward_prefetch', False)
-            metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get('backward_prefetch', None)
+            metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get(
+                'backward_prefetch',
+                FSDP(Module()).backward_prefetch
+            )
 
             # Get device_mesh from config so it is in list form and JSON parsable
             metrics['composer/device_mesh'] = trainer_state.fsdp_config.get('device_mesh', [])
@@ -196,6 +202,9 @@ def log_exception(self, exception: Exception):
         self._log_metadata({'exception': exception_to_json_serializable_dict(exception)})
         self._flush_metadata(force_flush=True)
 
+    def init(self, state: State, logger: Logger) -> None:
+        self.log_analytics()
+
     def after_load(self, state: State, logger: Logger) -> None:
         # Log model data downloaded and initialized for run events
         log.debug(f'Logging model initialized time to metadata')
@@ -374,14 +383,3 @@ def exception_to_json_serializable_dict(exc: Exception):
         except AttributeError:
             pass
     return exc_data
-
-
-def log_run_analytics(loggers: Tuple[LoggerDestination, ...]):
-    """Log run analytics to metadata if a MosaicMLLogger is available in the list."""
-    # Avoids a casting bug during testing
-    if not isinstance(loggers, Iterable):
-        return
-
-    mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None)
-    if mosaicml_logger is not None:
-        mosaicml_logger.log_analytics()

From b499ab4894c37b878a2d6cc9ca7b1687b638e054 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 19 Mar 2024 17:12:18 -0700
Subject: [PATCH 17/32] =?UTF-8?q?more=20formatting=20=F0=9F=A4=A1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 composer/loggers/mosaicml_logger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 9b4e1355fd..9c8079122e 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -173,7 +173,7 @@ def log_analytics(self,) -> None:
             metrics['composer/forward_prefetch'] = trainer_state.fsdp_config.get('forward_prefetch', False)
             metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get(
                 'backward_prefetch',
-                FSDP(Module()).backward_prefetch
+                FSDP(Module()).backward_prefetch,
             )

From 45b1f8b214404de38184292de17c7477393ade7b Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Tue, 19 Mar 2024 17:34:27 -0700
Subject: [PATCH 18/32] get rid of unnecessary diff

---
 composer/core/engine.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/composer/core/engine.py b/composer/core/engine.py
index e0e640d9a3..17e5365e2a 100644
--- a/composer/core/engine.py
+++ b/composer/core/engine.py
@@ -293,7 +293,6 @@ def run_event(
             self._run_loggers(event)
             self._run_nonlogger_callbacks(event)
             traces = self._run_algorithms(event)
-
         else:
             traces = self._run_algorithms(event)
             # Run callbacks first, so any log calls from a callback that are executed lazily
From 06cd615a0e827034ee08f0ceaf88417918f3835d Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Wed, 20 Mar 2024 13:12:29 -0700
Subject: [PATCH 19/32] make tests for `get_logger_type`

---
 composer/utils/analytics_helpers.py   |  2 +-
 tests/loggers/test_mosaicml_logger.py | 52 ++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py
index b38a2a9497..718644edb9 100644
--- a/composer/utils/analytics_helpers.py
+++ b/composer/utils/analytics_helpers.py
@@ -39,4 +39,4 @@ def get_logger_type(logger: Any) -> str:
     for logger_type in LOGGER_TYPES:
         if isinstance(logger, logger_type):
             return logger_type.__name__
-    return 'Custom'
+    return 'Other'
diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py
index 795c8da56b..9b92d719f6 100644
--- a/tests/loggers/test_mosaicml_logger.py
+++ b/tests/loggers/test_mosaicml_logger.py
@@ -2,7 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
+import os
 from concurrent.futures import Future
+from pathlib import Path
 from typing import Type
 from unittest.mock import MagicMock
 
@@ -12,7 +14,7 @@
 from torch.utils.data import DataLoader
 
 from composer.core import Callback, Time, TimeUnit
-from composer.loggers import WandBLogger
+from composer.loggers import CometMLLogger, LoggerDestination, RemoteUploaderDownloader, WandBLogger
 from composer.loggers.mosaicml_logger import (
     MOSAICML_ACCESS_TOKEN_ENV_VAR,
     MOSAICML_PLATFORM_ENV_VAR,
@@ -22,6 +24,7 @@
 )
 from composer.trainer import Trainer
 from composer.utils import dist, get_composer_env_dict
+from composer.utils.analytics_helpers import LOGGER_TYPES, get_logger_type
 from tests.callbacks.callback_settings import get_cb_kwargs, get_cb_model_and_datasets, get_cbs_and_marks
 from tests.common import RandomClassificationDataset, SimpleModel
 from tests.common.markers import world_size
@@ -57,6 +60,53 @@ def _update_metadata(self, run_name, new_metadata):
         json.dumps(self.run_metadata[run_name])
 
 
+@pytest.fixture
+def comet_offline_directory(tmp_path):
+    return str(tmp_path / Path('my_cometml_runs'))
+
+
+@pytest.fixture
+def comet_logger(monkeypatch, comet_offline_directory):
+    comet_ml = pytest.importorskip('comet_ml', reason='comet_ml is optional')
+
+    monkeypatch.setattr(comet_ml, 'Experiment', comet_ml.OfflineExperiment)
+    from composer.loggers import CometMLLogger
+
+    # Set offline directory.
+    os.environ['COMET_OFFLINE_DIRECTORY'] = comet_offline_directory
+
+    comet_logger = CometMLLogger()
+    return comet_logger
+
+
+def test_get_logger_type(tmp_path: Path, comet_logger: CometMLLogger):
+    """Test that `get_logger_type` returns the correct logger type."""
+    for logger in LOGGER_TYPES:
+        if logger == CometMLLogger:
+            assert get_logger_type(comet_logger) == 'CometMLLogger'
+        elif logger == RemoteUploaderDownloader:
+            remote_dir = str(tmp_path / 'object_store')
+            os.makedirs(remote_dir, exist_ok=True)
+            remote_uploader_downloader = RemoteUploaderDownloader(remote_dir)
+            assert get_logger_type(remote_uploader_downloader) == 'RemoteUploaderDownloader'
+        else:
+            assert get_logger_type(logger()) == logger.__name__
+
+    # Custom loggers should default to `LoggerDestination`
+    class CustomLogger(LoggerDestination):
+        pass
+
+    assert get_logger_type(CustomLogger()) == 'LoggerDestination'
+
+    # If logger isn't a subclass of any known logger, it should default to 'Other'
+    class DummyClass:
+
+        def __init__(self):
+            return
+
+    assert get_logger_type(DummyClass()) == 'Other'
+
+
 def test_format_data_to_json_serializable():
     data = {
         'key1': 'value1',

From a08bc518bd8ece0284bb8ab868f8f0cfd1d9d645 Mon Sep 17 00:00:00 2001
From: Angel Ruiz
Date: Wed, 20 Mar 2024 17:58:22 -0700
Subject: [PATCH 20/32] add analytics metadata test, log `optimizer` and `algorithms` using `__dict__`

---
 composer/loggers/mosaicml_logger.py   |  37 ++++++--
 composer/trainer/trainer.py           |  11 +++
 tests/loggers/test_mosaicml_logger.py | 130 ++++++++++++++++----------
 3 files changed, 124 insertions(+), 54 deletions(-)

diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py
index 9d6e30e803..826c4585b9 100644
--- a/composer/loggers/mosaicml_logger.py
+++ b/composer/loggers/mosaicml_logger.py
@@ -50,7 +50,27 @@ class MosaicMLLogger(LoggerDestination):
     """Log to the MosaicML platform.
 
     Logs metrics to the MosaicML platform. Logging only happens on rank 0 every ``log_interval``
-    seconds to avoid performance issues.
+    seconds to avoid performance issues. The following metrics are logged upon ``INIT``:
+        - ``composer/autoresume``: Whether or not the run can be stopped / resumed during training.
+        - ``composer/precision``: The precision to use for training.
+        - ``composer/train_loader_workers``: The number of workers for the train dataloader.
+        - ``composer/eval_loaders``: A list of dictionaries containing the label and the number of workers for each
+          evaluation dataloader.
+        - ``composer/optimizers``: A list of dictionaries containing the name and fields of each optimizer.
+        - ``composer/algorithms``: A list of dictionaries containing the name and fields of each algorithm.
+        - ``composer/loggers``: A list containing the loggers used in the ``Trainer``.
+        - ``composer/cloud_provider_data``: The cloud provider for the load path.
+        - ``composer/cloud_provider_checkpoints``: The cloud provider for the save folder.
+        - ``composer/save_interval``: The save interval for the run.
+        - FSDP specific metrics:
+            - ``composer/sharding_strategy``: The sharding strategy used.
+            - ``composer/activation_checkpointing``: Whether or not activation checkpointing is used.
+            - ``composer/forward_prefetch``: Whether or not forward prefetch is used.
+            - ``composer/backward_prefetch``: Whether or not backward prefetch is used.
+            - ``composer/device_mesh``: The device mesh used.
+            - ``composer/mixed_precision``: The mixed precision configuration used.
+            - ``composer/state_dict_type``: The state dict type of FSDP config.
+
     When running on the MosaicML platform, the logger is automatically enabled by Trainer.
     To disable, the environment variable 'MOSAICML_PLATFORM' can be set to False.
@@ -134,12 +154,15 @@ def log_analytics(self,) -> None:
                     }),
                 )
 
-        metrics['composer/optimizers'] = [
-            json.dumps(optimizer.state_dict(), sort_keys=True) for optimizer in trainer_state.optimizers
-        ]
-        metrics['composer/algorithms'] = [
-            json.dumps(algorithm.state_dict(), sort_keys=True) for algorithm in trainer_state.algorithms
-        ]
+        metrics['composer/optimizers'] = [{
+            'name': optimizer.__class__.__name__,
+            'fields': optimizer.__dict__,
+        } for optimizer in trainer_state.optimizers]
+        metrics['composer/algorithms'] = [{
+            'name': algorithm.__class__.__name__,
+            'fields': algorithm.__dict__,
+        } for algorithm in trainer_state.algorithms]
+
         metrics['composer/loggers'] = [
             get_logger_type(logger) if not isinstance(logger, MosaicMLLogger) else 'MosaicMLLogger'
             for logger in loggers
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index cee5fce72d..0da086be65 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1295,6 +1295,17 @@ def __init__(
                 },
             )
             loggers.append(mosaicml_logger)
+        elif any(isinstance(x, MosaicMLLogger) for x in loggers):
+            # If a MosaicMLLogger is already present (i.e. passed into the Trainer), update the analytics data
+            mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)))
+            mosaicml_logger.analytics_data = {
+                'autoresume': autoresume,
+                'state': self.state,
+                'save_interval': save_interval,
+                'loggers': loggers,
+                'load_path': load_path,
+                'save_folder': save_folder,
+            }
 
         # Remote Uploader Downloader
         # Keep the ``RemoteUploaderDownloader`` below client-provided loggers so the loggers init callbacks run before
diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py
index 9b92d719f6..a58f60738c 100644
--- a/tests/loggers/test_mosaicml_logger.py
+++ b/tests/loggers/test_mosaicml_logger.py
@@ -60,53 +60,6 @@ def _update_metadata(self, run_name, new_metadata):
         json.dumps(self.run_metadata[run_name])
 
 
-@pytest.fixture
-def comet_offline_directory(tmp_path):
-    return str(tmp_path / Path('my_cometml_runs'))
-
-
-@pytest.fixture
-def comet_logger(monkeypatch, comet_offline_directory):
-    comet_ml = pytest.importorskip('comet_ml', reason='comet_ml is optional')
-
-    monkeypatch.setattr(comet_ml, 'Experiment', comet_ml.OfflineExperiment)
-    from composer.loggers import CometMLLogger
-
-    # Set offline directory.
-    os.environ['COMET_OFFLINE_DIRECTORY'] = comet_offline_directory
-
-    comet_logger = CometMLLogger()
-    return comet_logger
-
-
-def test_get_logger_type(tmp_path: Path, comet_logger: CometMLLogger):
-    """Test that `get_logger_type` returns the correct logger type."""
-    for logger in LOGGER_TYPES:
-        if logger == CometMLLogger:
-            assert get_logger_type(comet_logger) == 'CometMLLogger'
-        elif logger == RemoteUploaderDownloader:
-            remote_dir = str(tmp_path / 'object_store')
-            os.makedirs(remote_dir, exist_ok=True)
-            remote_uploader_downloader = RemoteUploaderDownloader(remote_dir)
-            assert get_logger_type(remote_uploader_downloader) == 'RemoteUploaderDownloader'
-        else:
-            assert get_logger_type(logger()) == logger.__name__
-
-    # Custom loggers should default to `LoggerDestination`
-    class CustomLogger(LoggerDestination):
-        pass
-
-    assert get_logger_type(CustomLogger()) == 'LoggerDestination'
-
-    # If logger isn't a subclass of any known logger, it should default to 'Other'
-    class DummyClass:
-
-        def __init__(self):
-            return
-
-    assert get_logger_type(DummyClass()) == 'Other'
-
-
 def test_format_data_to_json_serializable():
     data = {
         'key1': 'value1',
@@ -434,3 +387,86 @@ def test_epoch_zero_no_dataloader_progress_metrics():
     assert training_progress['training_progress'] == '[epoch=1/3]'
     assert 'training_sub_progress' in training_progress
     assert training_progress['training_sub_progress'] == '[batch=1]'
+
+
+@pytest.fixture
+def comet_offline_directory(tmp_path):
+    return str(tmp_path / Path('my_cometml_runs'))
+
+
+@pytest.fixture
+def comet_logger(monkeypatch, comet_offline_directory):
+    comet_ml = pytest.importorskip('comet_ml', reason='comet_ml is optional')
+
+    monkeypatch.setattr(comet_ml, 'Experiment', comet_ml.OfflineExperiment)
+    from composer.loggers import CometMLLogger
+
+    # Set offline directory.
+ os.environ['COMET_OFFLINE_DIRECTORY'] = comet_offline_directory + + comet_logger = CometMLLogger() + return comet_logger + + +def test_logged_metrics(monkeypatch): + mock_mapi = MockMAPI() + monkeypatch.setenv('MOSAICML_PLATFORM', 'True') + monkeypatch.setattr(mcli, 'update_run_metadata', mock_mapi.update_run_metadata) + run_name = 'test-run-name' + monkeypatch.setenv('RUN_NAME', run_name) + trainer = Trainer( + model=SimpleModel(), + train_dataloader=DataLoader(RandomClassificationDataset()), + train_subset_num_batches=1, + max_duration='4ba', + loggers=[MosaicMLLogger()], + ) + trainer.fit() + + # Check that analytics metrics were logged + metadata = mock_mapi.run_metadata[run_name] + analytics = {k: v for k, v in metadata.items() if k.startswith('mosaicml/composer/')} + assert len(analytics) > 0 + + key_name = lambda x: f'mosaicml/composer/{x}' + assert key_name('autoresume') in analytics and analytics[key_name('autoresume')] == False + assert key_name('precision') in analytics and analytics[key_name('precision')] == 'Precision.FP32' + assert key_name('eval_loaders') in analytics and analytics[key_name('eval_loaders')] == [] + + raise Exception(str(analytics)) + + assert key_name('optimizers') in analytics and analytics[key_name('optimizers')] == [ + '{"param_groups": [{"dampening": 0, "differentiable": false, "foreach": null, "initial_lr": 0.1, "lr": 0.1, "maximize": false, "momentum": 0, "nesterov": false, "params": [0, 1, 2, 3], "weight_decay": 0}], "state": {}}' + ] + assert key_name('algorithms') in analytics and analytics[key_name('algorithms')] == [] + assert key_name('loggers') in analytics and analytics[key_name('loggers') + ] == ["MosaicMLLogger", "ProgressBarLogger"] + assert key_name('save_interval') in analytics and analytics[key_name('save_interval')] == '1ep' + + +def test_get_logger_type(tmp_path: Path, comet_logger: CometMLLogger): + """Test that `get_logger_type` returns the correct logger type.""" + for logger in LOGGER_TYPES: + if logger == CometMLLogger: + assert get_logger_type(comet_logger) == 'CometMLLogger' + elif logger == RemoteUploaderDownloader: + remote_dir = str(tmp_path / 'object_store') + os.makedirs(remote_dir, exist_ok=True) + remote_uploader_downloader = RemoteUploaderDownloader(remote_dir) + assert get_logger_type(remote_uploader_downloader) == 'RemoteUploaderDownloader' + else: + assert get_logger_type(logger()) == logger.__name__ + + # Custom loggers should default to `LoggerDestination` + class CustomLogger(LoggerDestination): + pass + + assert get_logger_type(CustomLogger()) == 'LoggerDestination' + + # If logger isn't a subclass of any known logger, it should default to 'Other' + class DummyClass: + + def __init__(self): + return + + assert get_logger_type(DummyClass()) == 'Other' From 192fb4bb9dbe0bd5e7b977a3d2340b856ebc95bc Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Wed, 20 Mar 2024 18:01:05 -0700 Subject: [PATCH 21/32] run formatters --- composer/loggers/mosaicml_logger.py | 2 +- tests/loggers/test_mosaicml_logger.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 826c4585b9..33e01dd932 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -162,7 +162,7 @@ def log_analytics(self,) -> None: 'name': algorithm.__class__.__name__, 'fields': algorithm.__dict__, } for algorithm in trainer_state.algorithms] - + metrics['composer/loggers'] = [ get_logger_type(logger) if not isinstance(logger, 
MosaicMLLogger) else 'MosaicMLLogger' for logger in loggers diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index a58f60738c..7921fafb00 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -436,11 +436,11 @@ def test_logged_metrics(monkeypatch): raise Exception(str(analytics)) assert key_name('optimizers') in analytics and analytics[key_name('optimizers')] == [ - '{"param_groups": [{"dampening": 0, "differentiable": false, "foreach": null, "initial_lr": 0.1, "lr": 0.1, "maximize": false, "momentum": 0, "nesterov": false, "params": [0, 1, 2, 3], "weight_decay": 0}], "state": {}}' + '{"param_groups": [{"dampening": 0, "differentiable": false, "foreach": null, "initial_lr": 0.1, "lr": 0.1, "maximize": false, "momentum": 0, "nesterov": false, "params": [0, 1, 2, 3], "weight_decay": 0}], "state": {}}', ] assert key_name('algorithms') in analytics and analytics[key_name('algorithms')] == [] assert key_name('loggers') in analytics and analytics[key_name('loggers') - ] == ["MosaicMLLogger", "ProgressBarLogger"] + ] == ['MosaicMLLogger', 'ProgressBarLogger'] assert key_name('save_interval') in analytics and analytics[key_name('save_interval')] == '1ep' From 0c6439e215cda31d0ca37f5632b3be8dd6a9f370 Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Wed, 20 Mar 2024 18:07:25 -0700 Subject: [PATCH 22/32] adjust type hint for `get_logger_type` and delete test `Exception` --- composer/utils/analytics_helpers.py | 4 +--- tests/loggers/test_mosaicml_logger.py | 6 ------ 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py index 718644edb9..27885099d0 100644 --- a/composer/utils/analytics_helpers.py +++ b/composer/utils/analytics_helpers.py @@ -3,8 +3,6 @@ """Helpers for logging analytics with the MosaicMLLogger.""" -from typing import Any - from composer.loggers.cometml_logger import CometMLLogger from composer.loggers.console_logger import ConsoleLogger from composer.loggers.file_logger import FileLogger @@ -34,7 +32,7 @@ ] -def get_logger_type(logger: Any) -> str: +def get_logger_type(logger: LoggerDestination) -> str: """Returns the type of a logger as a string. 
If the logger is not a known type, returns 'Custom'.""" for logger_type in LOGGER_TYPES: if isinstance(logger, logger_type): diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index 7921fafb00..d59cbe8553 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -432,12 +432,6 @@ def test_logged_metrics(monkeypatch): assert key_name('autoresume') in analytics and analytics[key_name('autoresume')] == False assert key_name('precision') in analytics and analytics[key_name('precision')] == 'Precision.FP32' assert key_name('eval_loaders') in analytics and analytics[key_name('eval_loaders')] == [] - - raise Exception(str(analytics)) - - assert key_name('optimizers') in analytics and analytics[key_name('optimizers')] == [ - '{"param_groups": [{"dampening": 0, "differentiable": false, "foreach": null, "initial_lr": 0.1, "lr": 0.1, "maximize": false, "momentum": 0, "nesterov": false, "params": [0, 1, 2, 3], "weight_decay": 0}], "state": {}}', - ] assert key_name('algorithms') in analytics and analytics[key_name('algorithms')] == [] assert key_name('loggers') in analytics and analytics[key_name('loggers') ] == ['MosaicMLLogger', 'ProgressBarLogger'] From 6abd957b1e7859337b04792aeaa38a191d2db1df Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Wed, 20 Mar 2024 18:25:30 -0700 Subject: [PATCH 23/32] fix formatting on docstring --- composer/loggers/mosaicml_logger.py | 16 +++++++--------- tests/loggers/test_mosaicml_logger.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 33e01dd932..ac9250b28e 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -62,15 +62,13 @@ class MosaicMLLogger(LoggerDestination): - ``composer/cloud_provider_data``: The cloud provider for the load path. - ``composer/cloud_provider_checkpoints``: The cloud provider for the save folder. - ``composer/save_interval``: The save interval for the run. - - FSDP specific metrics: - - ``composer/sharding_strategy``: The sharding strategy used. - - ``composer/activation_checkpointing``: Whether or not activation checkpointing is used. - - ``composer/forward_prefetch``: Whether or not forward prefetch is used. - - ``composer/backward_prefetch``: Whether or not backward prefetch is used. - - ``composer/device_mesh``: The device mesh used. - - ``composer/mixed_precision``: The mixed precision configuration used. - - ``composer/state_dict_type``: The state dict type of FSDP config. - + - ``composer/sharding_strategy``: The sharding strategy used. + - ``composer/activation_checkpointing``: Whether or not activation checkpointing is used. + - ``composer/forward_prefetch``: Whether or not forward prefetch is used. + - ``composer/backward_prefetch``: Whether or not backward prefetch is used. + - ``composer/device_mesh``: The device mesh used. + - ``composer/mixed_precision``: The mixed precision configuration used. + - ``composer/state_dict_type``: The state dict type of FSDP config. When running on the MosaicML platform, the logger is automatically enabled by Trainer. To disable, the environment variable 'MOSAICML_PLATFORM' can be set to False. 
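Patch 23's remaining hunk (below) adds a `# type: ignore` because `DummyClass` is not a `LoggerDestination`, so the annotation introduced in the previous patch rejects it statically even though the runtime fallback handles it. The dispatch being exercised is small enough to restate standalone; a condensed sketch with stand-in classes (not composer's real loggers):

    class LoggerDestination:  # stand-in for composer.loggers.LoggerDestination
        pass

    class WandBLogger(LoggerDestination):  # stand-in for a concrete logger
        pass

    # Most specific types first; LoggerDestination last, so custom subclasses
    # resolve to 'LoggerDestination' instead of falling through to 'Other'.
    LOGGER_TYPES = [WandBLogger, LoggerDestination]

    def get_logger_type(logger) -> str:
        for logger_type in LOGGER_TYPES:
            if isinstance(logger, logger_type):
                return logger_type.__name__
        return 'Other'

    class CustomLogger(LoggerDestination):
        pass

    assert get_logger_type(WandBLogger()) == 'WandBLogger'
    assert get_logger_type(CustomLogger()) == 'LoggerDestination'
    assert get_logger_type(object()) == 'Other'  # unrelated objects hit the fallback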
diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index d59cbe8553..d244078b8b 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -463,4 +463,4 @@ class DummyClass: def __init__(self): return - assert get_logger_type(DummyClass()) == 'Other' + assert get_logger_type(DummyClass()) == 'Other' # type: ignore From f10e8a972ee04b815bca452d3f6ecce3d7d2c07e Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Wed, 20 Mar 2024 19:05:04 -0700 Subject: [PATCH 24/32] remove indent in comment --- composer/loggers/mosaicml_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index ac9250b28e..a4aaebf664 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -55,7 +55,7 @@ class MosaicMLLogger(LoggerDestination): - ``composer/precision``: The precision to use for training. - ``composer/train_loader_workers``: The number of workers for the train dataloader. - ``composer/eval_loaders``: A list of dictionaries containing the label and the number of workers for each - evaluation dataloader. + evaluation dataloader. - ``composer/optimizers``: A list of dictionaries containing the _ for each opimizer. - ``composer/algorithms``: A list of dictionaries containing the _ for each algorithm. - ``composer/loggers``: A list containing the loggers used in the ``Trainer``. From 11eb8538438d882aa899e86567f2a287b9e34ccc Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Thu, 21 Mar 2024 13:25:37 -0700 Subject: [PATCH 25/32] remove underscored fields and `param_groups` from `composer/optimizer` logs --- composer/loggers/mosaicml_logger.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index a4aaebf664..1506147e15 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -152,9 +152,12 @@ def log_analytics(self,) -> None: }), ) + get_optimizer_args = lambda optimizer: { + k: v for k, v in optimizer.__dict__.items() if not k.startswith('_') and k != 'param_groups' + } metrics['composer/optimizers'] = [{ 'name': optimizer.__class__.__name__, - 'fields': optimizer.__dict__, + 'fields': get_optimizer_args(optimizer), } for optimizer in trainer_state.optimizers] metrics['composer/algorithms'] = [{ 'name': algorithm.__class__.__name__, From 59dea8fde5cd9be381a6c7d6bb3a8a03cce37e5e Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Thu, 21 Mar 2024 13:35:59 -0700 Subject: [PATCH 26/32] display name and data in one field for `optimizers` and `algorithms` --- composer/loggers/mosaicml_logger.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 1506147e15..ea7c7450be 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -156,12 +156,10 @@ def log_analytics(self,) -> None: k: v for k, v in optimizer.__dict__.items() if not k.startswith('_') and k != 'param_groups' } metrics['composer/optimizers'] = [{ - 'name': optimizer.__class__.__name__, - 'fields': get_optimizer_args(optimizer), + optimizer.__class__.__name__: get_optimizer_args(optimizer), } for optimizer in trainer_state.optimizers] metrics['composer/algorithms'] = [{ - 'name': algorithm.__class__.__name__, - 'fields': algorithm.__dict__, + algorithm.__class__.__name__: algorithm.__dict__, } for 
algorithm in trainer_state.algorithms] metrics['composer/loggers'] = [ From 5037ffb8acdcbe04e46af8f63eecf20abfa0d035 Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Fri, 22 Mar 2024 15:20:25 -0700 Subject: [PATCH 27/32] fix docstring --- composer/loggers/mosaicml_logger.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index ea7c7450be..6bf838cbd8 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -50,14 +50,16 @@ class MosaicMLLogger(LoggerDestination): """Log to the MosaicML platform. Logs metrics to the MosaicML platform. Logging only happens on rank 0 every ``log_interval`` - seconds to avoid performance issues. The following metrics are logged upon ``INIT``: + seconds to avoid performance issues. + + Additionally, The following metrics are logged upon ``INIT``: - ``composer/autoresume``: Whether or not the run can be stopped / resumed during training. - ``composer/precision``: The precision to use for training. - ``composer/train_loader_workers``: The number of workers for the train dataloader. - ``composer/eval_loaders``: A list of dictionaries containing the label and the number of workers for each evaluation dataloader. - - ``composer/optimizers``: A list of dictionaries containing the _ for each opimizer. - - ``composer/algorithms``: A list of dictionaries containing the _ for each algorithm. + - ``composer/optimizers``: A list of dictionaries containing information about each opimizer. + - ``composer/algorithms``: A list of dictionaries containing information about each algorithm. - ``composer/loggers``: A list containing the loggers used in the ``Trainer``. - ``composer/cloud_provider_data``: The cloud provider for the load path. - ``composer/cloud_provider_checkpoints``: The cloud provider for the save folder. @@ -86,7 +88,7 @@ class MosaicMLLogger(LoggerDestination): (default: ``None``) ignore_exceptions: Flag to disable logging exceptions. Defaults to False. - analytics_data (Dict[str, Any], optional): Analytical metrics to log about the current run. Defaults to ``None``. + analytics_data (Dict[str, Any], optional): A dictionary containing variables used to log analytics. Defaults to ``None``. 
""" def __init__( From 8c867bcb0c78c89fdd0fadfd812db4f15200485a Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Mon, 25 Mar 2024 21:53:36 -0700 Subject: [PATCH 28/32] Make `MosaicAnalyticsData` class, change cloud path names, and log `fsdp_config` as a dict --- composer/loggers/mosaicml_logger.py | 118 +++++++++----------------- composer/trainer/trainer.py | 38 +++++---- composer/utils/analytics_helpers.py | 40 --------- tests/loggers/test_mosaicml_logger.py | 31 +------ 4 files changed, 62 insertions(+), 165 deletions(-) delete mode 100644 composer/utils/analytics_helpers.py diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 6bf838cbd8..296ab58277 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -14,14 +14,13 @@ import time import warnings from concurrent.futures import wait +from dataclasses import dataclass from functools import reduce from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import mcli import torch import torch.utils.data -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.nn import Module from composer.core.event import Event from composer.core.time import Time, TimeUnit @@ -29,7 +28,6 @@ from composer.loggers.logger_destination import LoggerDestination from composer.loggers.wandb_logger import WandBLogger from composer.utils import dist -from composer.utils.analytics_helpers import get_logger_type from composer.utils.file_helpers import parse_uri if TYPE_CHECKING: @@ -55,21 +53,13 @@ class MosaicMLLogger(LoggerDestination): Additionally, The following metrics are logged upon ``INIT``: - ``composer/autoresume``: Whether or not the run can be stopped / resumed during training. - ``composer/precision``: The precision to use for training. - - ``composer/train_loader_workers``: The number of workers for the train dataloader. - - ``composer/eval_loaders``: A list of dictionaries containing the label and the number of workers for each - evaluation dataloader. - - ``composer/optimizers``: A list of dictionaries containing information about each opimizer. + - ``composer/eval_loaders``: A list containing the label for each evaluation dataloader. + - ``composer/optimizers``: A list of dictionaries containing information about each optimizer. - ``composer/algorithms``: A list of dictionaries containing information about each algorithm. - ``composer/loggers``: A list containing the loggers used in the ``Trainer``. - - ``composer/cloud_provider_data``: The cloud provider for the load path. - - ``composer/cloud_provider_checkpoints``: The cloud provider for the save folder. + - ``composer/cloud_provided_load_path``: The cloud provider for the load path. + - ``composer/cloud_provided_save_folder``: The cloud provider for the save folder. - ``composer/save_interval``: The save interval for the run. - - ``composer/sharding_strategy``: The sharding strategy used. - - ``composer/activation_checkpointing``: Whether or not activation checkpointing is used. - - ``composer/forward_prefetch``: Whether or not forward prefetch is used. - - ``composer/backward_prefetch``: Whether or not backward prefetch is used. - - ``composer/device_mesh``: The device mesh used. - - ``composer/mixed_precision``: The mixed precision configuration used. - ``composer/state_dict_type``: The state dict type of FSDP config. When running on the MosaicML platform, the logger is automatically enabled by Trainer. 
To disable, @@ -96,7 +86,7 @@ def __init__( log_interval: int = 60, ignore_keys: Optional[List[str]] = None, ignore_exceptions: bool = False, - analytics_data: Optional[Dict[str, Any]] = None, + analytics_data: Optional[MosaicAnalyticsData] = None, ) -> None: self.log_interval = log_interval self.ignore_keys = ignore_keys @@ -125,93 +115,58 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]): def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None: self._log_metadata(metrics) - def log_analytics(self,) -> None: + def log_analytics(self, state: State) -> None: if self.analytics_data is None: return - # Fetch / cast metrics that we want to log from self.analytics_data - autoresume: bool = self.analytics_data['autoresume'] - trainer_state: State = self.analytics_data['state'] - save_interval: Union[str, int, Time, Callable[[State, Event], bool]] = self.analytics_data['save_interval'] - loggers: List[LoggerDestination] = self.analytics_data['loggers'] - load_path: Union[str, None] = self.analytics_data['load_path'] - save_folder: Union[str, None] = self.analytics_data['save_folder'] - - metrics: Dict[str, Any] = {'composer/autoresume': autoresume, 'composer/precision': trainer_state.precision} - - train_dataloader = trainer_state.train_dataloader - if train_dataloader is not None and isinstance(train_dataloader, torch.utils.data.DataLoader): - metrics['composer/train_loader_workers'] = train_dataloader.num_workers - + metrics: Dict[str, Any] = { + 'composer/autoresume': self.analytics_data.autoresume, + 'composer/precision': state.precision, + } metrics['composer/eval_loaders'] = [] - for evaluator in trainer_state.evaluators: + for evaluator in state.evaluators: dataloader = evaluator.dataloader.dataloader if isinstance(dataloader, torch.utils.data.DataLoader): metrics['composer/eval_loaders'].append( json.dumps({ 'label': evaluator.label, - 'num_workers': dataloader.num_workers, }), ) - get_optimizer_args = lambda optimizer: { - k: v for k, v in optimizer.__dict__.items() if not k.startswith('_') and k != 'param_groups' - } metrics['composer/optimizers'] = [{ - optimizer.__class__.__name__: get_optimizer_args(optimizer), - } for optimizer in trainer_state.optimizers] + optimizer.__class__.__name__: optimizer.defaults, + } for optimizer in state.optimizers] metrics['composer/algorithms'] = [{ algorithm.__class__.__name__: algorithm.__dict__, - } for algorithm in trainer_state.algorithms] + } for algorithm in state.algorithms] - metrics['composer/loggers'] = [ - get_logger_type(logger) if not isinstance(logger, MosaicMLLogger) else 'MosaicMLLogger' - for logger in loggers - ] + metrics['composer/loggers'] = [logger.__class__.__name__ for logger in self.analytics_data.loggers] # Take the service provider out of the URI and log it to metadata. If no service provider # is found (i.e. backend = ''), then we assume 'local' for the cloud provider. 
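The comment above is the whole contract: only the storage backend of each URI is recorded, never the path itself. A rough standalone equivalent using only the standard library (`provider_of` is a hypothetical helper for illustration; composer's own `parse_uri` returns a `(backend, bucket, path)` tuple, which is what the lines below unpack):

    from urllib.parse import urlparse

    def provider_of(uri: str) -> str:
        # 's3', 'gs', ... for object-store URIs; '' for plain filesystem paths.
        backend = urlparse(uri).scheme
        return backend if backend else 'local'

    assert provider_of('s3://my-bucket/checkpoints') == 's3'
    assert provider_of('/mnt/checkpoints') == 'local'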
- if load_path is not None: - backend, _, _ = parse_uri(load_path) - metrics['composer/cloud_provider_data'] = backend if backend else 'local' - if save_folder is not None: - backend, _, _ = parse_uri(save_folder) - metrics['composer/cloud_provider_checkpoints'] = backend if backend else 'local' + if self.analytics_data.load_path is not None: + backend, _, _ = parse_uri(self.analytics_data.load_path) + metrics['composer/cloud_provided_load_path'] = backend if backend else 'local' + if self.analytics_data.save_folder is not None: + backend, _, _ = parse_uri(self.analytics_data.save_folder) + metrics['composer/cloud_provided_save_folder'] = backend if backend else 'local' # Save interval can be passed in w/ multiple types. If the type is a function, then # we log 'callable' as the save_interval value for analytics. - if isinstance(save_interval, Union[str, int]): - save_interval_str = str(save_interval) - elif isinstance(save_interval, Time): - save_interval_str = f'{save_interval._value}{save_interval._unit}' + if isinstance(self.analytics_data.save_interval, Union[str, int]): + save_interval_str = str(self.analytics_data.save_interval) + elif isinstance(self.analytics_data.save_interval, Time): + save_interval_str = f'{self.analytics_data.save_interval._value}{self.analytics_data.save_interval._unit}' else: save_interval_str = 'callable' metrics['composer/save_interval'] = save_interval_str - if trainer_state.fsdp_config: - metrics['composer/sharding_strategy'] = trainer_state.fsdp_config.get('sharding_strategy', None) - metrics['composer/activation_checkpointing'] = trainer_state.fsdp_config.get( - 'activation_checkpointing', - False, - ) - metrics['composer/forward_prefetch'] = trainer_state.fsdp_config.get('forward_prefetch', False) - metrics['composer/backward_prefetch'] = trainer_state.fsdp_config.get( - 'backward_prefetch', - FSDP(Module()).backward_prefetch, - ) - - # Get device_mesh from config so it is in list form and JSON parsable - metrics['composer/device_mesh'] = trainer_state.fsdp_config.get('device_mesh', []) - - mixed_precision = trainer_state.fsdp_config.get('mixed_precision', None) - if mixed_precision is not None and isinstance(mixed_precision, dict): - # Sorting the keys allows us to parse this dict value as JSON in a SQL query if needed - metrics['composer/mixed_precision'] = json.dumps(mixed_precision, sort_keys=True) - else: - metrics['composer/mixed_precision'] = mixed_precision + if state.fsdp_config: + # Keys need to be sorted so they can be parsed consistently in SQL queries + metrics['composer/fsdp_config'] = json.dumps(state.fsdp_config, sort_keys=True) - if trainer_state.fsdp_state_dict_type is not None: - metrics['composer/state_dict_type'] = trainer_state.fsdp_state_dict_type + if state.fsdp_state_dict_type is not None: + metrics['composer/state_dict_type'] = state.fsdp_state_dict_type self.log_metrics(metrics) self._flush_metadata(force_flush=True) @@ -221,7 +176,7 @@ def log_exception(self, exception: Exception): self._flush_metadata(force_flush=True) def init(self, state: State, logger: Logger) -> None: - self.log_analytics() + self.log_analytics(state) def after_load(self, state: State, logger: Logger) -> None: # Log model data downloaded and initialized for run events @@ -352,6 +307,15 @@ def _get_training_progress_metrics(self, state: State) -> Dict[str, Any]: return training_progress_metrics +@dataclass(frozen=True) +class MosaicAnalyticsData: + autoresume: bool + save_interval: Union[str, int, Time, Callable[[State, Event], bool]] + loggers: 
List[LoggerDestination] + load_path: Union[str, None] + save_folder: Union[str, None] + + def format_data_to_json_serializable(data: Any): """Recursively formats data to be JSON serializable. diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b54d663ef7..7c6a800f0b 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -83,7 +83,11 @@ RemoteUploaderDownloader, WandBLogger, ) -from composer.loggers.mosaicml_logger import MOSAICML_ACCESS_TOKEN_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR +from composer.loggers.mosaicml_logger import ( + MOSAICML_ACCESS_TOKEN_ENV_VAR, + MOSAICML_PLATFORM_ENV_VAR, + MosaicAnalyticsData, +) from composer.models import ComposerModel from composer.optim import ComposerScheduler, DecoupledSGDW, compile_composer_scheduler from composer.profiler import Profiler @@ -1284,28 +1288,26 @@ def __init__( MOSAICML_ACCESS_TOKEN_ENV_VAR, ) is not None and not any(isinstance(x, MosaicMLLogger) for x in loggers): log.info('Detected run on MosaicML platform. Adding MosaicMLLogger to loggers.') - mosaicml_logger = MosaicMLLogger( - analytics_data={ - 'autoresume': autoresume, - 'state': self.state, - 'save_interval': save_interval, - 'loggers': loggers, - 'load_path': load_path, - 'save_folder': save_folder, - }, + + analytics_data = MosaicAnalyticsData( + autoresume=autoresume, + save_interval=save_interval, + loggers=loggers, + load_path=load_path, + save_folder=save_folder, ) + mosaicml_logger = MosaicMLLogger(analytics_data=analytics_data,) loggers.append(mosaicml_logger) elif any(isinstance(x, MosaicMLLogger) for x in loggers): # If a MosaicMLLogger is already present (i.e. passed into the Trainer), update the analytics data mosaicml_logger = next((logger for logger in loggers if isinstance(logger, MosaicMLLogger))) - mosaicml_logger.analytics_data = { - 'autoresume': autoresume, - 'state': self.state, - 'save_interval': save_interval, - 'loggers': loggers, - 'load_path': load_path, - 'save_folder': save_folder, - } + mosaicml_logger.analytics_data = MosaicAnalyticsData( + autoresume=autoresume, + save_interval=save_interval, + loggers=loggers, + load_path=load_path, + save_folder=save_folder, + ) # Remote Uploader Downloader # Keep the ``RemoteUploaderDownloader`` below client-provided loggers so the loggers init callbacks run before diff --git a/composer/utils/analytics_helpers.py b/composer/utils/analytics_helpers.py deleted file mode 100644 index 27885099d0..0000000000 --- a/composer/utils/analytics_helpers.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2024 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Helpers for logging analytics with the MosaicMLLogger.""" - -from composer.loggers.cometml_logger import CometMLLogger -from composer.loggers.console_logger import ConsoleLogger -from composer.loggers.file_logger import FileLogger -from composer.loggers.in_memory_logger import InMemoryLogger -from composer.loggers.logger_destination import LoggerDestination -from composer.loggers.mlflow_logger import MLFlowLogger -from composer.loggers.neptune_logger import NeptuneLogger -from composer.loggers.progress_bar_logger import ProgressBarLogger -from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader -from composer.loggers.slack_logger import SlackLogger -from composer.loggers.tensorboard_logger import TensorboardLogger -from composer.loggers.wandb_logger import WandBLogger - -LOGGER_TYPES = [ - FileLogger, - SlackLogger, - WandBLogger, - MLFlowLogger, - NeptuneLogger, - ConsoleLogger, - 
CometMLLogger, - InMemoryLogger, - TensorboardLogger, - ProgressBarLogger, - RemoteUploaderDownloader, - LoggerDestination, -] - - -def get_logger_type(logger: LoggerDestination) -> str: - """Returns the type of a logger as a string. If the logger is not a known type, returns 'Custom'.""" - for logger_type in LOGGER_TYPES: - if isinstance(logger, logger_type): - return logger_type.__name__ - return 'Other' diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index d244078b8b..178d9df984 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -14,7 +14,7 @@ from torch.utils.data import DataLoader from composer.core import Callback, Time, TimeUnit -from composer.loggers import CometMLLogger, LoggerDestination, RemoteUploaderDownloader, WandBLogger +from composer.loggers import WandBLogger from composer.loggers.mosaicml_logger import ( MOSAICML_ACCESS_TOKEN_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR, @@ -24,7 +24,6 @@ ) from composer.trainer import Trainer from composer.utils import dist, get_composer_env_dict -from composer.utils.analytics_helpers import LOGGER_TYPES, get_logger_type from tests.callbacks.callback_settings import get_cb_kwargs, get_cb_model_and_datasets, get_cbs_and_marks from tests.common import RandomClassificationDataset, SimpleModel from tests.common.markers import world_size @@ -436,31 +435,3 @@ def test_logged_metrics(monkeypatch): assert key_name('loggers') in analytics and analytics[key_name('loggers') ] == ['MosaicMLLogger', 'ProgressBarLogger'] assert key_name('save_interval') in analytics and analytics[key_name('save_interval')] == '1ep' - - -def test_get_logger_type(tmp_path: Path, comet_logger: CometMLLogger): - """Test that `get_logger_type` returns the correct logger type.""" - for logger in LOGGER_TYPES: - if logger == CometMLLogger: - assert get_logger_type(comet_logger) == 'CometMLLogger' - elif logger == RemoteUploaderDownloader: - remote_dir = str(tmp_path / 'object_store') - os.makedirs(remote_dir, exist_ok=True) - remote_uploader_downloader = RemoteUploaderDownloader(remote_dir) - assert get_logger_type(remote_uploader_downloader) == 'RemoteUploaderDownloader' - else: - assert get_logger_type(logger()) == logger.__name__ - - # Custom loggers should default to `LoggerDestination` - class CustomLogger(LoggerDestination): - pass - - assert get_logger_type(CustomLogger()) == 'LoggerDestination' - - # If logger isn't a subclass of any known logger, it should default to 'Other' - class DummyClass: - - def __init__(self): - return - - assert get_logger_type(DummyClass()) == 'Other' # type: ignore From 57530201b835a8ef8815aff6299f45dc871f63fd Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Mon, 25 Mar 2024 21:59:03 -0700 Subject: [PATCH 29/32] just log algorithm names for analytics --- composer/loggers/mosaicml_logger.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 296ab58277..df0a0693e4 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -55,7 +55,7 @@ class MosaicMLLogger(LoggerDestination): - ``composer/precision``: The precision to use for training. - ``composer/eval_loaders``: A list containing the label for each evaluation dataloader. - ``composer/optimizers``: A list of dictionaries containing information about each optimizer. - - ``composer/algorithms``: A list of dictionaries containing information about each algorithm. 
+ - ``composer/algorithms``: A list containing the names of the algorithms used for training. - ``composer/loggers``: A list containing the loggers used in the ``Trainer``. - ``composer/cloud_provided_load_path``: The cloud provider for the load path. - ``composer/cloud_provided_save_folder``: The cloud provider for the save folder. @@ -136,9 +136,7 @@ def log_analytics(self, state: State) -> None: metrics['composer/optimizers'] = [{ optimizer.__class__.__name__: optimizer.defaults, } for optimizer in state.optimizers] - metrics['composer/algorithms'] = [{ - algorithm.__class__.__name__: algorithm.__dict__, - } for algorithm in state.algorithms] + metrics['composer/algorithms'] = [algorithm.__class__.__name__ for algorithm in state.algorithms] metrics['composer/loggers'] = [logger.__class__.__name__ for logger in self.analytics_data.loggers] From 1083ae6afda96e1be08e06e0ae7d02169a51b2a1 Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Mon, 25 Mar 2024 22:05:39 -0700 Subject: [PATCH 30/32] just pass `evaluator.label` for `eval_loaders` --- composer/loggers/mosaicml_logger.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index df0a0693e4..a07c874d30 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -53,7 +53,7 @@ class MosaicMLLogger(LoggerDestination): Additionally, The following metrics are logged upon ``INIT``: - ``composer/autoresume``: Whether or not the run can be stopped / resumed during training. - ``composer/precision``: The precision to use for training. - - ``composer/eval_loaders``: A list containing the label for each evaluation dataloader. + - ``composer/eval_loaders``: A list containing the labels of each evaluation dataloader. - ``composer/optimizers``: A list of dictionaries containing information about each optimizer. - ``composer/algorithms``: A list containing the names of the algorithms used for training. - ``composer/loggers``: A list containing the loggers used in the ``Trainer``. @@ -127,11 +127,7 @@ def log_analytics(self, state: State) -> None: for evaluator in state.evaluators: dataloader = evaluator.dataloader.dataloader if isinstance(dataloader, torch.utils.data.DataLoader): - metrics['composer/eval_loaders'].append( - json.dumps({ - 'label': evaluator.label, - }), - ) + metrics['composer/eval_loaders'].append(evaluator.label) metrics['composer/optimizers'] = [{ optimizer.__class__.__name__: optimizer.defaults, From bf02105db7079fff8ccacbc988c9a6b18beecb21 Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Tue, 26 Mar 2024 10:25:14 -0700 Subject: [PATCH 31/32] fix `fsdp_config`, `eval_loaders`, and `loggers`. 
also `warn` when analytics logging fails --- composer/loggers/mosaicml_logger.py | 23 ++++++++--------------- composer/trainer/trainer.py | 2 -- tests/loggers/test_mosaicml_logger.py | 21 --------------------- 3 files changed, 8 insertions(+), 38 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index a07c874d30..f8e3494894 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -16,7 +16,7 @@ from concurrent.futures import wait from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import mcli import torch @@ -115,7 +115,7 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]): def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None: self._log_metadata(metrics) - def log_analytics(self, state: State) -> None: + def log_analytics(self, state: State, loggers: Tuple[LoggerDestination, ...]) -> None: if self.analytics_data is None: return @@ -123,18 +123,12 @@ def log_analytics(self, state: State) -> None: 'composer/autoresume': self.analytics_data.autoresume, 'composer/precision': state.precision, } - metrics['composer/eval_loaders'] = [] - for evaluator in state.evaluators: - dataloader = evaluator.dataloader.dataloader - if isinstance(dataloader, torch.utils.data.DataLoader): - metrics['composer/eval_loaders'].append(evaluator.label) - + metrics['composer/eval_loaders'] = [evaluator.label for evaluator in state.evaluators] metrics['composer/optimizers'] = [{ optimizer.__class__.__name__: optimizer.defaults, } for optimizer in state.optimizers] metrics['composer/algorithms'] = [algorithm.__class__.__name__ for algorithm in state.algorithms] - - metrics['composer/loggers'] = [logger.__class__.__name__ for logger in self.analytics_data.loggers] + metrics['composer/loggers'] = [logger.__class__.__name__ for logger in loggers] # Take the service provider out of the URI and log it to metadata. If no service provider # is found (i.e. backend = ''), then we assume 'local' for the cloud provider. @@ -159,9 +153,6 @@ def log_analytics(self, state: State) -> None: # Keys need to be sorted so they can be parsed consistently in SQL queries metrics['composer/fsdp_config'] = json.dumps(state.fsdp_config, sort_keys=True) - if state.fsdp_state_dict_type is not None: - metrics['composer/state_dict_type'] = state.fsdp_state_dict_type - self.log_metrics(metrics) self._flush_metadata(force_flush=True) @@ -170,7 +161,10 @@ def log_exception(self, exception: Exception): self._flush_metadata(force_flush=True) def init(self, state: State, logger: Logger) -> None: - self.log_analytics(state) + try: + self.log_analytics(state, logger.destinations) + except: + warnings.warn('Failed to log analytics data to MosaicML. 
Continuing without logging analytics data.') def after_load(self, state: State, logger: Logger) -> None: # Log model data downloaded and initialized for run events @@ -305,7 +299,6 @@ def _get_training_progress_metrics(self, state: State) -> Dict[str, Any]: class MosaicAnalyticsData: autoresume: bool save_interval: Union[str, int, Time, Callable[[State, Event], bool]] - loggers: List[LoggerDestination] load_path: Union[str, None] save_folder: Union[str, None] diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c7af95cd15..41f34a51d8 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1292,7 +1292,6 @@ def __init__( analytics_data = MosaicAnalyticsData( autoresume=autoresume, save_interval=save_interval, - loggers=loggers, load_path=load_path, save_folder=save_folder, ) @@ -1304,7 +1303,6 @@ def __init__( mosaicml_logger.analytics_data = MosaicAnalyticsData( autoresume=autoresume, save_interval=save_interval, - loggers=loggers, load_path=load_path, save_folder=save_folder, ) diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index 178d9df984..5048398378 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -2,9 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -import os from concurrent.futures import Future -from pathlib import Path from typing import Type from unittest.mock import MagicMock @@ -388,25 +386,6 @@ def test_epoch_zero_no_dataloader_progress_metrics(): assert training_progress['training_sub_progress'] == '[batch=1]' -@pytest.fixture -def comet_offline_directory(tmp_path): - return str(tmp_path / Path('my_cometml_runs')) - - -@pytest.fixture -def comet_logger(monkeypatch, comet_offline_directory): - comet_ml = pytest.importorskip('comet_ml', reason='comet_ml is optional') - - monkeypatch.setattr(comet_ml, 'Experiment', comet_ml.OfflineExperiment) - from composer.loggers import CometMLLogger - - # Set offline directory. - os.environ['COMET_OFFLINE_DIRECTORY'] = comet_offline_directory - - comet_logger = CometMLLogger() - return comet_logger - - def test_logged_metrics(monkeypatch): mock_mapi = MockMAPI() monkeypatch.setenv('MOSAICML_PLATFORM', 'True') From 3be10edf486c7e16c8b1d70402dc8794f704f3d4 Mon Sep 17 00:00:00 2001 From: Angel Ruiz Date: Tue, 26 Mar 2024 10:33:59 -0700 Subject: [PATCH 32/32] update docstring --- composer/loggers/mosaicml_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index f8e3494894..7d1f34633e 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -60,7 +60,7 @@ class MosaicMLLogger(LoggerDestination): - ``composer/cloud_provided_load_path``: The cloud provider for the load path. - ``composer/cloud_provided_save_folder``: The cloud provider for the save folder. - ``composer/save_interval``: The save interval for the run. - - ``composer/state_dict_type``: The state dict type of FSDP config. + - ``composer/fsdp_config``: The FSDP config used for training. When running on the MosaicML platform, the logger is automatically enabled by Trainer. To disable, the environment variable 'MOSAICML_PLATFORM' can be set to False.
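Taken together, the series ends with a small, self-contained surface. A hedged sketch of how the pieces fit after PATCH 32 (assumes a MosaicML platform run with the platform env vars set; the paths and interval are placeholders):

    from composer.loggers.mosaicml_logger import MosaicAnalyticsData, MosaicMLLogger

    analytics_data = MosaicAnalyticsData(
        autoresume=False,
        save_interval='1ep',  # str, int, Time, or a callable; callables are logged as 'callable'
        load_path='s3://my-bucket/checkpoints/latest',  # -> composer/cloud_provided_load_path = 's3'
        save_folder='./checkpoints',                    # -> composer/cloud_provided_save_folder = 'local'
    )
    logger = MosaicMLLogger(analytics_data=analytics_data)

    # Passing this logger to Trainer(loggers=[logger]) causes the Trainer to
    # refresh its analytics_data and log the composer/* metrics at INIT; on the
    # MosaicML platform, the Trainer constructs the logger itself if none is
    # passed, and any failure in log_analytics now only emits a warning instead
    # of aborting the run.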