Full finetune FSDP2 recipe #1287
@@ -4,6 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import os
import sys
import time
@@ -15,25 +16,21 @@
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.distributed import init_process_group
from torch.distributed.fsdp import (
CPUOffload,
FullOptimStateDictConfig,
FullStateDictConfig,
FullyShardedDataParallel as FSDP,
StateDictType,
from torch.distributed import destroy_process_group, init_process_group
from torch.distributed._composable.fsdp import CPUOffloadPolicy, fully_shard
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
CheckpointWrapper,
)

from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler

from torchtune import config, modules, utils
from torchtune.datasets import ConcatDataset
from torchtune.recipe_interfaces import FTRecipeInterface
from torchtune.utils.activations import apply_selective_activation_checkpointing

from tqdm import tqdm


log = utils.get_logger("DEBUG")
@@ -43,8 +40,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
distributed training and can be run on a single node (1 to 8 GPUs).

Features:
- FSDP. Supported using PyTorch's FSDP APIs. DDP is currently not supported. Training on CPU is not
supported.

- Activation Checkpointing. This can be controlled using the ``activation_checkpointing``
flag. Activation checkpointing helps reduce the memory footprint since we no longer keep
@@ -92,10 +89,10 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):

Raises:
ValueError: If ``dtype`` is set to fp16.
RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16.
"""

def __init__(self, cfg: DictConfig) -> None:

self._device = utils.get_device(device=cfg.device)
self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
@@ -192,37 +189,31 @@ def _update_recipe_state(self, ckpt_dict: Dict[str, Any]) -> None:

def setup(self, cfg: DictConfig) -> None:
"""
Sets up the recipe state correctly. This includes setting recipe attributes based
on the ``resume_from_checkpoint`` flag.
Setup the recipe state. This includes recipe state (if resume_from_checkpoint is True),

Review comment: This sentence reads funny - just duplicating "recipe state" twice.

model, tokenizer, loss, optimizer, sampler, and dataloader.
"""
if self._is_rank_zero:
self._metric_logger = config.instantiate(cfg.metric_logger)

# log config with parameter override
self._metric_logger.log_config(cfg)

ckpt_dict = self.load_checkpoint(cfg.checkpointer)
checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)

# ``_setup_model`` handles initialization and loading the state dict. This method
# should be called before ``_setup_optimizer`` since transforming the optimizer
# state dict requires the model
self._model = self._setup_model(
cfg_model=cfg.model,
enable_activation_checkpointing=cfg.enable_activation_checkpointing,
memory_efficient_fsdp_wrap=cfg.get("memory_efficient_fsdp_wrap", False),
fsdp_cpu_offload=cfg.get("fsdp_cpu_offload", False),
model_state_dict=ckpt_dict[utils.MODEL_KEY],
model_state_dict=checkpoint_dict[utils.MODEL_KEY],

Review comment: [Misplaced comment] I never fully understood
Reply: Yes, I think we do still need some version of it, otherwise our peak memory will be higher on Llama3 models.

ac_mode=cfg.get("ac_mode", None),
ac_option=cfg.get("ac_option", None),
)

self._tokenizer = config.instantiate(cfg.tokenizer)

# _setup_optimizer should take in ckpt_dict only if training is resumed from
# checkpoint. Transforming the opt state dict is handled by this method
self._optimizer = self._setup_optimizer(
cfg_optimizer=cfg.optimizer,
opt_state_dict=ckpt_dict[utils.OPT_KEY]
opt_state_dict=checkpoint_dict[utils.OPT_KEY]
if self._resume_from_checkpoint
else None,
)
@@ -266,37 +257,21 @@ def _setup_model(
) -> nn.Module:
"""
Model initialization has some important considerations:
a. To minimize GPU peak memory, we load the model on CPU with the right
dtype. To ensure that we don't instantiate ``world_size`` number of models,
we initialize on meta_device for all ranks other than rank 0.
b. Rank 0 is also responsible for calling ``load_state_dict`` and loading the
model weights from checkpoint.
c. While wrapping the model with FSDP, we set ``sync_module_states``
to TRUE and broadcast module params and buffers from rank 0.
d. The ``device_id`` param ensures that the FSDP initialization happens on
the correct device.
a. To minimize GPU peak memory, we initialize the model on meta device with
the right dtype
b. All ranks call ``load_state_dict`` without peaking CPU RAM since

Review comment: Why do all ranks load state dict now?
Reply: Because we shard as we load. Previously we only loaded on rank 0 because we relied on sync_module_states to broadcast to all the other ranks. I think this way should be more memory-efficient because we never need a full copy of the state dict on rank 0.

full state dicts are loaded with ``torch.load(mmap=True)``
c. We register (pre-)forward hooks with ``fully_shard`` instead of wrapping `nn.Module`

Review comment: I don't think we should say "instead of" - this should just explain how we shard, not compare to FSDP1.
Review comment: "We register (pre-)forward hooks" -> why do we need this, or what does this add to the docstring? I'd be willing to bet not many users know what this means.
Reply: Yeah, these are both fair points, lemme update.

"""
if self._is_rank_zero:
log.info("FSDP is enabled. Instantiating Model on CPU for Rank 0 ...")
init_start = time.perf_counter()

with utils.set_default_dtype(self._dtype):
model = config.instantiate(cfg_model)

if self._is_rank_zero:
log.info(
f"Model instantiation took {time.perf_counter() - init_start:.2f} secs"
"FSDP is enabled. Instantiating model and loading checkpoint on Rank 0 ..."

Review comment: Where is FSDP being enabled? Does _is_rank_zero only get set to true if init_process_group is called? If so, that's not very clear from the code.
Reply: Sorry, not sure I fully understand this question.

)
init_start = time.perf_counter()

# Load both the model weights. This should happen only on Rank 0
model.load_state_dict(model_state_dict)

else:
# For non-zero ranks, load the model on meta device
with utils.set_default_dtype(self._dtype), torch.device("meta"):
model = config.instantiate(cfg_model)

if self._dtype == torch.bfloat16:
model = model.to(torch.bfloat16)
with utils.set_default_dtype(self._dtype), torch.device("meta"):
model = config.instantiate(cfg_model)

# We currently have two versions of activation checkpointing in this recipe
# for testing and BC purposes. ``enable_activation_checkpointing`` controls
@@ -314,41 +289,51 @@ def _setup_model(
ac_option,
)

# Wrap the model with FSDP. This will ensure that the model is sharded
# across all available GPUs.
model = FSDP(
module=model,
auto_wrap_policy=utils.get_full_finetune_fsdp_wrap_policy(
memory_efficient_fsdp_wrap=memory_efficient_fsdp_wrap,
modules_to_wrap={modules.TransformerDecoderLayer},
),
cpu_offload=CPUOffload(offload_params=fsdp_cpu_offload),
sharding_strategy=torch.distributed.fsdp.ShardingStrategy.FULL_SHARD,
device_id=self._device,
# this recipe does not currently support mixed precision training
mixed_precision=None,
# Ensure we broadcast params and buffers from rank 0
sync_module_states=True,
# Initialize empty modules on all non-zero ranks
param_init_fn=(
lambda module: module.to_empty(
device=torch.device("cuda"), recurse=False
)
if not self._is_rank_zero
else None
),
)

# Ensure no params and buffers are on meta device
utils.validate_no_params_on_meta_device(model)

# original activation checkpointing (full) - flip the condition above
if enable_activation_checkpointing and ac_mode is None:
utils.set_activation_checkpointing(
model, auto_wrap_policy={modules.TransformerDecoderLayer}
)
fsdp_kwargs = {}

Review comment: [Misplaced comment] Do we need to remove the logic related to
Reply: Oh yeah, I can see how this is confusing. A bit awkward, but auto_wrap_policy for AC != auto_wrap_policy from our dear old friend FSDP. You can see the usage here.
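Since the AC wrapping is easy to confuse with FSDP wrapping, here is a hedged sketch of what the activation-checkpointing side amounts to, assuming ``model`` is the instantiated ``TransformerDecoder``. The recipe actually goes through ``utils.set_activation_checkpointing``; the direct PyTorch calls below are shown only to illustrate what that wrapping does.

```python
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    apply_activation_checkpointing,
    checkpoint_wrapper,
)
from torchtune import modules

# Wrap every TransformerDecoderLayer in a CheckpointWrapper so its activations
# are recomputed during the backward pass instead of being kept in memory.
apply_activation_checkpointing(
    model,
    checkpoint_wrapper_fn=checkpoint_wrapper,
    check_fn=lambda m: isinstance(m, modules.TransformerDecoderLayer),
)
```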
if fsdp_cpu_offload:

Review comment: Apologies if I'm projecting, but I think this could use some comments around what exactly we are offloading to CPU.

fsdp_kwargs["offload_policy"] = CPUOffloadPolicy()

Review comment: Should we expose the sharding strategy like in #1024?
Reply: We can, but imo at this stage it's a nice-to-have rather than a must-have.
Reply: It does feel wrong to remove a feature that was just recently contributed though. It feels like the goal of moving this out of dev is to integrate it with everything we already have.
Reply: It hasn't landed yet though.
Reply: My vote would be to expose the sharding strategy - it's a really easy lever to trade off memory for performance, and it's unfortunate that we still haven't landed this for FSDP1.
Reply: To follow up on this, there is not a 1:1 mapping between FSDP1 sharding strategies and FSDP2 APIs. But for single-node training (all we currently support), the only relevant sharding strategies should be FULL_SHARD, SHARD_GRAD_OP, and NO_SHARD; HYBRID_SHARD and _HYBRID_SHARD_ZERO2 are only distinct from these in the case of multi-node training. So then we are just down to FULL_SHARD and SHARD_GRAD_OP, which correspond to setting reshard_after_forward to True and False, respectively. So I can expose this flag in the recipe. Thanks to @weifengpy for helping to clear this up!
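To make that concrete, a minimal sketch of how those two knobs could be threaded through ``fully_shard``. The ``reshard_after_forward`` recipe flag and the ``fsdp_cpu_offload`` variable are assumptions based on the thread above, not something already in this diff.

```python
from torch.distributed._composable.fsdp import CPUOffloadPolicy, fully_shard

fsdp_kwargs = {}
if fsdp_cpu_offload:
    # Parameters, gradients, and optimizer state live in pinned CPU memory
    # and are streamed to the GPU for compute.
    fsdp_kwargs["offload_policy"] = CPUOffloadPolicy()

# reshard_after_forward=True  -> free unsharded params after forward (FULL_SHARD-like)
# reshard_after_forward=False -> keep them until backward (SHARD_GRAD_OP-like: faster, more memory)
fsdp_kwargs["reshard_after_forward"] = reshard_after_forward

for layer in model.layers:
    fully_shard(layer, **fsdp_kwargs)
fully_shard(model, **fsdp_kwargs)
```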
# Shard the model with FSDP

Review comment: You already know this, but the block below is giving away too much in terms of readability. It probably needs some well-designed utilities. But apart from utilities, I think you can structure this in a way which is much easier to understand. Remember, torchtune is one of the only libraries with FSDP2 integration. We should showcase this well.
Reply: Yeah, this is the top thing on my plate to refactor here.

for m in reversed(list(model.modules())):

Review comment: Would it make sense to have utils for these two things? Also, whether as utils or still here, it might be cleaner to follow if you looped through the modules list two times, once for AC and once for distributed.
Reply: Yeah, I think we should have utils. And to be clear, we are not actually doing any AC wrapping here, that happens above. Here we are just checking which modules have been wrapped by AC already.

# TransformerDecoderLayer is wrapped by CheckpointWrapper
# when enable_activation_checkpointing
if enable_activation_checkpointing:
if isinstance(m, CheckpointWrapper):
fully_shard(m, **fsdp_kwargs)
# For large vocab size, we can additionally shard
# the token embeddings and output projections individually
if memory_efficient_fsdp_wrap:

Review comment: Can we just rename this to "shared_embedding" or something more descriptive?
Reply: That's not what this is though. This is just for the case where we want to shard token embeddings and output projections in their own unit (instead of just in the top-level model sharding). We found this saves a fair bit of memory for Llama3. I do agree the naming is a bit awkward, definitely open to other suggestions here.
Reply: Since we're discussing this, can I say that having model-specific functionality (this was included for Llama3) infiltrating the recipe always made little sense to me. Branding it with something as generic as "memory_efficient_fsdp_wrap" makes this worse. Maybe leaking this into the recipe is unavoidable, but I think we can abstract this functionality better. One idea is to pass the modules we want to wrap separately as strings to the utility which does the wrapping. This would be very similar to how we decide the layers for LoRA. It also does away with the need to call this "memory_efficient".
Reply: So a couple thoughts here: (1) This is an optimization that we only apply to Llama 3, but it should be generally useful for any model with a very large vocab size. Of course I agree that as we've named it none of these points are obvious; my initial intention here was to be as non-BC-breaking as possible, but if you want a rename I am happy to do it. (2) On passing modules we want to wrap separately as strings: I think this is an interesting idea -- if we iterate over named modules we could probably just do a check on the FQN? The only concern I have is that it could be a bit easy for someone to shoot themselves in the foot here; we are giving a lot more freedom over configurations that we have not really tried out ourselves. Either way I do plan to move this into a separate utility. I'm still on the fence about how exactly to expose the individual wrapping of these layers, will try a couple approaches and see what makes sense here.
if isinstance(m, nn.Embedding):
fully_shard(m, **fsdp_kwargs)
if isinstance(m, modules.TransformerDecoder):
fully_shard(m.output, **fsdp_kwargs)
else:
if isinstance(m, modules.TransformerDecoderLayer):
fully_shard(m, **fsdp_kwargs)
fully_shard(model, **fsdp_kwargs)
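Picking up the two suggestions above (move this loop into a utility, and select modules by FQN or type instead of a ``memory_efficient_fsdp_wrap`` flag), one possible shape for such a helper is sketched below. The name ``shard_model``, the ``shard_conditions`` argument, and the assumption that ``model`` is already instantiated are all hypothetical, not part of torchtune.

```python
from typing import Callable, List

import torch.nn as nn
from torch.distributed._composable.fsdp import fully_shard
from torchtune import modules


def shard_model(
    model: nn.Module,
    shard_conditions: List[Callable[[str, nn.Module], bool]],
    **fsdp_kwargs,
) -> None:
    # Shard every submodule matching any condition; reversed() so that
    # children are sharded before their parents.
    for name, module in reversed(list(model.named_modules())):
        if any(cond(name, module) for cond in shard_conditions):
            fully_shard(module, **fsdp_kwargs)
    # Finally shard the root module, which picks up any remaining parameters.
    fully_shard(model, **fsdp_kwargs)


# Usage: shard each decoder layer, plus (for large-vocab models) the token
# embedding and output projection in their own units, selected by FQN.
shard_model(
    model,
    shard_conditions=[
        lambda name, m: isinstance(m, modules.TransformerDecoderLayer),
        lambda name, m: name in ("tok_embeddings", "output"),
    ],
)
```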
with utils.set_default_dtype(self._dtype), self._device:
for m in model.modules():
# RoPE is not covered in state dict
if isinstance(m, modules.RotaryPositionalEmbeddings):
m.reset_parameters()

Review comment: How should I conceptually parse this block? IIUC, at this stage all of the model params are init on meta device.
Reply: It shouldn't matter because the RoPE buffers are not in the state dict anyways. This becomes more relevant for LoRA though, where we may have existing adapter weights to load. In that case, we either move from meta device on state dict load or on the usual param init, but not both. But even in this case, since we only do one or the other, I think the order shouldn't matter (lmk if this makes sense or not though).
Reply: Yeah, I think we can do this.
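To answer the "how should I conceptually parse this block" question in one place, the overall meta-device flow in this recipe is roughly the following. This is a condensed restatement using the recipe's own names; the ordering, not the exact code, is the point.

```python
# 1. Build the model on meta device: no parameter memory is allocated yet.
with utils.set_default_dtype(self._dtype), torch.device("meta"):
    model = config.instantiate(cfg_model)

# 2. Register FSDP2 hooks: parameters become sharded DTensors, still on meta.
for m in model.modules():
    if isinstance(m, modules.TransformerDecoderLayer):
        fully_shard(m)
fully_shard(model)

# 3. Materialize anything the checkpoint will NOT cover (e.g. RoPE buffers)
#    directly on the training device.
with utils.set_default_dtype(self._dtype), self._device:
    for m in model.modules():
        if isinstance(m, modules.RotaryPositionalEmbeddings):
            m.reset_parameters()

# 4. Load the checkpoint, which materializes the remaining sharded parameters
#    on the training device.
utils.load_from_full_model_state_dict(
    model, model_state_dict, self._device, self._is_rank_zero, strict=True
)
```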
utils.load_from_full_model_state_dict(

Review comment: Add a brief comment here saying what this is actually doing - helps make sure this function is self-explanatory, which is how we intended it to be.

model, model_state_dict, self._device, self._is_rank_zero, strict=True
)

# Ensure no params and buffers are on meta device
utils.validate_no_params_on_meta_device(model)

if self._is_rank_zero:
log.info(
f"Instantiating model and loading checkpoint took {time.perf_counter() - init_start:.2f} secs"
)
memory_stats = utils.get_memory_stats(device=self._device)
utils.log_memory_stats(memory_stats)
@@ -360,17 +345,13 @@ def _setup_model(
def _setup_optimizer(
self, cfg_optimizer: DictConfig, opt_state_dict: Optional[Dict[str, Any]] = None
) -> Optimizer:
"""
Set up the optimizer. This method also handles transforming the state dict
for FSDP.
"""
optimizer = config.instantiate(cfg_optimizer, self._model.parameters())

if opt_state_dict:
opt_state_dict = FSDP.optim_state_dict_to_load(
self._model, optimizer, opt_state_dict
utils.load_from_full_optimizer_state_dict(

Review comment: @weifengpy IIRC there were some APIs related to optimizer and state_dict that were in the process of being moved over to distributed? Is that still the plan? If so, can we just import these from distributed instead of from utils?
Reply: I worked with @mori360 last month to upstream the implementation into the PTD API. Numerically it's on par now, but we need to verify memory behavior before formal migration: pytorch/pytorch#128745

optimizer,
opt_state_dict,
self._device,
)
optimizer.load_state_dict(opt_state_dict)
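Since the thread above points at the upstreamed PTD API (pytorch/pytorch#128745), this is roughly what loading a full optimizer state dict into a sharded optimizer looks like with ``torch.distributed.checkpoint.state_dict``. Treat it as a sketch of the migration target mentioned there (availability depends on the PyTorch version), not as what ``utils.load_from_full_optimizer_state_dict`` does today; ``full_opt_state_dict`` is a placeholder name.

```python
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    set_optimizer_state_dict,
)

# Takes a *full* (unsharded) optimizer state dict and redistributes each
# state tensor to match the sharded parameters owned by this rank.
set_optimizer_state_dict(
    model,
    optimizer,
    optim_state_dict=full_opt_state_dict,
    options=StateDictOptions(full_state_dict=True, cpu_offload=True),
)
```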

if self._is_rank_zero:
log.info("Optimizer is initialized.")
@@ -391,21 +372,17 @@ def _setup_data(

if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, self._tokenizer)
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
packed = False
else:
ds = config.instantiate(cfg_dataset, self._tokenizer)
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
packed = cfg_dataset.get("packed", False)

sampler = DistributedSampler(
ds,
num_replicas=world_size,
rank=rank,
shuffle=shuffle,
seed=0,
ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
)
dataloader = DataLoader(
dataset=ds,
@@ -425,32 +402,48 @@ def _setup_data(

return sampler, dataloader

def save_checkpoint(self, epoch: int) -> None:
def save_checkpoint(
self,
epoch: int,
) -> None:
"""
Save state dict to file. The recipe save_checkpoint method is responsible for
correctly creating the checkpoint dict and passing to the checkpointer.
Checkpoint the state of the recipe. The constructed checkpoint state dict
contains the following information:
- Model weights with key MODEL_KEY
- Relevant recipe state if training is not complete

Checkpointer will save the model weights and recipe state in
different checkpoint files. To correctly resume training from an intermediate checkpoint,
the model weights and recipe state must be provided.
"""
# final dict passed onto the checkpointer
checkpoint_dict = {}

intermediate_checkpoint = epoch + 1 < self.total_epochs
# To prevent GPU memory from spiking during checkpoint save,
# we consolidate the full model and optim state dicts on CPU for rank 0
with FSDP.state_dict_type(
cpu_state_dict = utils.get_full_model_state_dict(
self._model,
StateDictType.FULL_STATE_DICT,
FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
):
cpu_state_dict = self._model.state_dict()
opt_state_dict = FSDP.optim_state_dict(self._model, self._optimizer)
self._is_rank_zero,
)

if intermediate_checkpoint:
opt_state_dict = utils.get_full_optimizer_state_dict(
self._optimizer,
self._is_rank_zero,
)
else:
opt_state_dict = None
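For readers unfamiliar with what the ``get_full_*_state_dict`` utilities have to do here: with FSDP2 each parameter is a DTensor shard, so saving means all-gathering each one and keeping the result only on rank 0, offloaded to CPU. Below is a minimal sketch of the model-side gather under that assumption (error handling and quantized/NF4 cases omitted; not the exact torchtune implementation).

```python
import torch
from torch.distributed._tensor import DTensor


def gather_cpu_state_dict(model: torch.nn.Module, is_rank_zero: bool) -> dict:
    cpu_state_dict = {}
    for name, param in model.state_dict().items():
        if isinstance(param, DTensor):
            # All-gather this parameter's shards into a regular full tensor.
            param = param.full_tensor()
        if is_rank_zero:
            # Only rank 0 keeps the full copy, moved off the GPU.
            cpu_state_dict[name] = param.cpu()
    return cpu_state_dict
```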

# Now that we have the model and opt state dict, create the actual checkpoint dict
# to be sent to the checkpointer and ultimately written to file
if self._is_rank_zero:

checkpoint_dict.update({utils.MODEL_KEY: cpu_state_dict})

# if training is in-progress, checkpoint the optimizer state as well
if epoch + 1 < self.total_epochs:
# if training is in-progress, checkpoint the optimizer state and recipe state
# as well.
if intermediate_checkpoint:
checkpoint_dict.update(
{
utils.OPT_KEY: opt_state_dict,
@@ -464,13 +457,12 @@ def save_checkpoint(self, epoch: int) -> None:
self._checkpointer.save_checkpoint(
checkpoint_dict,
epoch=epoch,
intermediate_checkpoint=(epoch + 1 < self.total_epochs),
intermediate_checkpoint=intermediate_checkpoint,
)

def train(self) -> None:
"""
The core training loop. Supports training on subsets of the dataset using the
``max_steps_per_epoch``.
The core training loop.
"""
# clean up before training begins
utils.cleanup_before_training()

@@ -573,7 +565,7 @@ def train(self) -> None:
def cleanup(self) -> None:
if self._is_rank_zero:
self._metric_logger.close()
torch.distributed.destroy_process_group()
destroy_process_group()


@config.parse
@@ -590,12 +582,8 @@ def recipe_main(cfg: DictConfig) -> None:
"Distributed finetune recipe should be run via a distributed launcher."
"If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
)

os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

Review comment: Can you explain what this does?
Reply: Oh yeah, this is a good question. This was originally used to save memory when we were using FSDP with
Reply: We can remove setting TORCH_NCCL_AVOID_RECORD_STREAMS in FSDP2 because we get rid of record_stream.
init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")

Review comment: I know this is old, but technically I think we only want nccl if the device is cuda. The default for init_process_group just does both, which works really well.
Reply: What other devices do we support right now though?
Reply: Not sure I understand this - do you mean we should just be calling
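The change being discussed would look something like the snippet below, relying on recent PyTorch picking ``cpu:gloo,cuda:nccl`` when no backend is given. Whether the recipe actually switches to this is left open in the thread above.

```python
from torch.distributed import init_process_group

# With no backend argument, recent PyTorch initializes both gloo (CPU) and
# nccl (CUDA) process groups and routes collectives by tensor device.
init_process_group()
```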
if cfg.get("fsdp_cpu_offload", False):
# Utilize all available CPU cores for intra-op parallelism. This provides ~2x
# speed up when benchmarking fused AdamW on CPU
utils.set_torch_num_threads()

Review comment: Setting the number of threads is a critical lesson we learned in CPU offloading; the perf impact is huge. Curious why this is being removed - do we not need CPU offloading anymore?
Reply: Ah thanks, yes this should stay in. Will update.
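Since the thread settles on keeping this, here is a rough sketch of what a ``set_torch_num_threads``-style helper does; the exact heuristic in torchtune may differ. This version simply divides the visible CPU cores across ranks and assumes the process group is already initialized.

```python
import os

import torch
import torch.distributed as dist


def set_torch_num_threads() -> None:
    # Give each rank an even share of the CPU cores so the fused CPU AdamW
    # step can use intra-op parallelism without oversubscribing the host.
    num_threads = max(1, (os.cpu_count() or 1) // dist.get_world_size())
    torch.set_num_threads(num_threads)
```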

config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg)
Review comment: I might be misremembering, but there was a discussion at some point around depending on a public API instead of something under "_checkpoint". Do you remember what that was?
Reply: If we're thinking about the same thing, it may have been the suggestion to move to torch.utils.checkpoint instead of relying on distributed APIs for AC. If it's all the same to you, I would punt on that here.