
enable LoRA + FSDP2 #855

Merged
merged 58 commits on Jun 3, 2024

Changes from 7 commits
Commits (58)
e5826a1
enable LoRA + FSDP2
weifengpy Apr 24, 2024
64fc870
reset params for lora weights and rope
weifengpy Apr 24, 2024
0cd21c6
support lora weights checkpoint and checkpoint utils
weifengpy Apr 24, 2024
589191e
fix lora meta device bug
weifengpy Apr 24, 2024
c801f26
save optim state dict
weifengpy Apr 25, 2024
19a2d70
mark TODO
weifengpy Apr 25, 2024
441da10
optimizer foreach=True for DTensor
weifengpy Apr 25, 2024
750b9e5
clip grad norm
weifengpy Apr 25, 2024
3d632d5
switch to ptd state dict api
weifengpy Apr 26, 2024
cb3abb3
add profiler
weifengpy May 1, 2024
e68804a
use torchao copy_
weifengpy May 1, 2024
d6af9a2
enable saving checkpoint
weifengpy May 1, 2024
b616394
optimizer state dict: load on rank0 and broadcast
weifengpy May 1, 2024
a400497
import Optimizer
weifengpy May 1, 2024
e9de63c
resume training
weifengpy May 3, 2024
05d3895
prepare for full test
weifengpy May 3, 2024
7a5bb80
prepare for full test
weifengpy May 3, 2024
64bf49c
remove profiler
weifengpy May 3, 2024
cb1bba4
passed integration test
weifengpy May 4, 2024
ac516e9
remove unnecessary change
weifengpy May 4, 2024
bfde704
Merge branch 'main' into fsdp2
weifengpy May 4, 2024
102db31
bring back state dict validation
weifengpy May 4, 2024
0b66651
align indent on comment
weifengpy May 4, 2024
672aabb
remove unused import
weifengpy May 4, 2024
6af2723
switch to ptd state dict and keep self implemented in record
weifengpy May 8, 2024
42ad99c
clean unused code
weifengpy May 8, 2024
74f6175
remove cuda value error
weifengpy May 8, 2024
f1b8a5e
comment on to_empty
weifengpy May 8, 2024
36e6829
fix memory issues by switching model state dict api
weifengpy May 8, 2024
08cd1fd
clean for review
weifengpy May 8, 2024
559bc4d
Merge branch 'main' into fsdp2
weifengpy May 8, 2024
2333134
fix linter
weifengpy May 9, 2024
49a0364
fix checkpoint loading
weifengpy May 9, 2024
dc2ce02
expecttest CI dependency
weifengpy May 9, 2024
0a604aa
ci dependency
weifengpy May 9, 2024
fa83140
fix CI issue
weifengpy May 10, 2024
4b5a895
Merge branch 'pytorch:main' into fsdp2
weifengpy May 10, 2024
a2e34ec
support resuming training
weifengpy May 14, 2024
6142031
update docstring
weifengpy May 14, 2024
7607e14
remove dependency on broadcast_from_rank0
weifengpy May 14, 2024
1899beb
remove the need for model.to(device)
weifengpy May 15, 2024
c1cfabb
wrap lora and TransformerBlock
weifengpy May 17, 2024
d7382ae
require torch version 2.4.0
weifengpy May 17, 2024
d1ff53b
FSDP(CheckpointWrapper(model))
weifengpy May 22, 2024
1eb9e87
remove model.to()
weifengpy May 29, 2024
695e959
add docstrings and remove dependency on dcp
weifengpy May 31, 2024
e10f638
remove try...catch FSDPModule
weifengpy Jun 1, 2024
b1e3d30
Merge branch 'main' into fsdp2
weifengpy Jun 1, 2024
944a723
fsdp2 as dev recipe
weifengpy Jun 1, 2024
ac5f7aa
restore lora_finetune_distributed
weifengpy Jun 1, 2024
d769626
test cudnn ci error
weifengpy Jun 2, 2024
f90c3cc
test CI error
weifengpy Jun 3, 2024
42ef49a
address CI error for setting seed
weifengpy Jun 3, 2024
170de94
add back pytest
weifengpy Jun 3, 2024
f8a7018
add expecttest
weifengpy Jun 3, 2024
a3b2f3e
pytest 7.4.0
weifengpy Jun 3, 2024
1a692b3
add dev/recipe
weifengpy Jun 3, 2024
8fbbc4b
update yaml with lora_finetune_fsdp2
weifengpy Jun 3, 2024
139 changes: 58 additions & 81 deletions recipes/lora_finetune_distributed.py
@@ -17,23 +17,24 @@

from torch import nn
from torch.distributed import destroy_process_group, init_process_group
from torch.distributed.fsdp import (
FullOptimStateDictConfig,
FullStateDictConfig,
FullyShardedDataParallel as FSDP,
StateDictType,
)
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed._tensor import DTensor
from torch.optim import Optimizer
from torch.optim.optimizer import _foreach_supported_types
from torch.utils.data import DataLoader, DistributedSampler
from torchtune import config, modules, utils
from torchtune.modules.peft import LoRALinear
from torchtune.modules.peft.peft_utils import (
get_adapter_params,
get_merged_lora_ckpt,
set_trainable_params,
validate_state_dict_for_lora,
)
from torchtune.recipe_interfaces import FTRecipeInterface

# use foreach on CUDA
if DTensor not in _foreach_supported_types:
_foreach_supported_types.append(DTensor)

from tqdm import tqdm

log = utils.get_logger("DEBUG")
@@ -277,86 +278,62 @@ def _setup_model(
the correct device.
"""

if self._device.type != "cuda":
raise ValueError(
f'FSDP needs device="cuda" but found device={self._device.type}'
)

if self._is_rank_zero:
log.info("FSDP is enabled. Instantiating Model on CPU for Rank 0 ...")
log.info("FSDP is enabled. Model init and checkpoint loading on Rank 0 ...")
Contributor

Suggested change
log.info("FSDP is enabled. Model init and checkpoint loading on Rank 0 ...")
log.info("FSDP is enabled. Instantiating model and loading checkpoint on Rank 0 ...")

init_start = time.perf_counter()

with utils.set_default_dtype(self._dtype):
model = config.instantiate(cfg_model)
with utils.set_default_dtype(self._dtype), torch.device("meta"):
Contributor

Sorry, I'm not able to comment above, but shouldn't the docstring of this function be updated since we're no longer initializing on CPU?

Contributor Author

The docstring used to say "Instantiating Model on CPU" (on the left) and I removed the mention of CPU. I did not mention the meta device because the timer now measures meta init + checkpoint loading. Happy to improve it if you are referring to this docstring.

Contributor Author

Oh, I just got your point. I've updated the docstring for _setup_model.

model = config.instantiate(cfg_model)

log.info(
f"Model instantiation took {time.perf_counter() - init_start:.2f} secs"
)
# Note: this needs to be set before wrapping with FSDP
self.adapter_params = get_adapter_params(model)
set_trainable_params(model, self.adapter_params)

# The model contains LoRA params which won't have any matching keys in
# the state dict. As a result, we need to load with strict=False.
# Before loading the state dict, ensure the state dict keys for the base
# model and adapters (if available) match the keys in the full LoRA model
# This is a good sanity check to prevent silent errors
validate_state_dict_for_lora(
lora_attn_modules=cfg_model.lora_attn_modules,
apply_lora_to_mlp=cfg_model.apply_lora_to_mlp,
apply_lora_to_output=getattr(cfg_model, "apply_lora_to_output", False),
full_model_state_dict_keys=model.state_dict().keys(),
lora_state_dict_keys=(
lora_weights_state_dict.keys()
if lora_weights_state_dict is not None
else None
),
base_model_state_dict_keys=base_model_state_dict.keys(),
if enable_activation_checkpointing:
utils.set_activation_checkpointing(
model, auto_wrap_policy={modules.TransformerDecoderLayer}
)

Contributor Author

if isinstance(m, modules.TransformerDecoderLayer): is the equivalent of the auto_wrap_policy in FSDP1
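
To make the equivalence concrete, here is a rough side-by-side sketch (assuming model and torchtune's modules are in scope; the two blocks are alternatives, not meant to be applied to the same model instance):

# FSDP1: the wrapping decision is hidden inside an auto_wrap_policy callable.
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import ModuleWrapPolicy

fsdp1_model = FSDP(
    model,
    auto_wrap_policy=ModuleWrapPolicy({modules.TransformerDecoderLayer}),
)

# FSDP2: the same decision is an explicit loop over submodules, then the root.
from torch.distributed._composable.fsdp import fully_shard

for m in model.modules():
    if isinstance(m, modules.TransformerDecoderLayer):
        fully_shard(m)
fully_shard(model)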

# Load both the base model weights and (if available) the adapter weights. Both
# of this should happen only on Rank 0
model.load_state_dict(base_model_state_dict, strict=False)
if lora_weights_state_dict:
model.load_state_dict(lora_weights_state_dict, strict=False)
for m in model.modules():
if isinstance(m, modules.TransformerDecoderLayer):
fully_shard(m)
fully_shard(model)
Contributor

Sorry for the noob question, but can you help me understand what's going on here? Why do I need to fully_shard each TransformerDecoderLayer and then call fully_shard on the model?

An unrelated question: if I have enough GPU memory, should I be thinking about using something similar to SHARD_GRAD_OP with FSDP2?

Contributor Author

In FSDP1, we wrap each TransformerDecoderLayer and then the root model as well. It's black-boxed inside auto_wrap_policy=utils.lora_fsdp_wrap_policy(modules_to_wrap={modules.TransformerDecoderLayer}).

In FSDP2, we un-black-box it into this for-loop. If you prefer, this can be factored into a util function in torchtune so that users call something like util.fully_shard(model, modules_to_wrap); see the sketch below.

Personally I'm biased towards the un-black-boxed approach, since people can modify the for-loop to achieve different wrapping.
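
A minimal sketch of what such a helper might look like (hypothetical; shard_model and its signature are not part of this PR):

from typing import Set, Type

from torch import nn
from torch.distributed._composable.fsdp import fully_shard


def shard_model(model: nn.Module, modules_to_wrap: Set[Type[nn.Module]]) -> None:
    """Shard every submodule whose type is in modules_to_wrap, then the root model."""
    wrap_types = tuple(modules_to_wrap)
    for m in model.modules():
        if isinstance(m, wrap_types):
            fully_shard(m)
    fully_shard(model)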

Contributor Author

The equivalent of SHARD_GRAD_OP in FSDP2 is reshard_after_forward=False. Do you want it exposed as a config in the .yaml?

fully_shard(model, reshard_after_forward=False)

Contributor

Thanks for the explanation! I love the un-black-boxed approach here - it just needs more documentation and explanation :) After reading the FSDP2 RFC, this became a lot clearer.


else:
# For non-zero ranks, load the model on meta device
with utils.set_default_dtype(self._dtype), torch.device("meta"):
model = config.instantiate(cfg_model)
utils.load_from_full_state_dict(
model, base_model_state_dict, self._device, self._is_rank_zero
)
if lora_weights_state_dict:
utils.load_from_full_state_dict(
model, lora_weights_state_dict, self._device, self._is_rank_zero
)

Contributor Author

Pros and cons of meta init: the pro is a 4.5x speedup during model init and thus a shorter TTFB; the con is that the user needs to call initialize_parameters on LoRALinear explicitly to move those weights from meta to GPU.

Contributor

Is this because these params are not being loaded from the checkpoint? Or do I misunderstand?

If this is indeed the reason, how do we handle this code block when the LoRA params are being loaded from a checkpoint (e.g. when resuming training)?

Contributor Author

You are right. When finetuning from an original HF checkpoint, lora_weights_state_dict = None.

When resuming training, lora_weights_state_dict is not None and we avoid calling m.initialize_parameters() again.

Contributor

Got you, thanks so much for the explanation! I think something that would be super helpful would be to document here, in the form of comments, the relationship between:

  • the modules on which we call fully_shard
  • init on the meta device
  • calling initialize_parameters and reset_parameters

Also, I think there was a technical reason with FSDP1 to call the function reset_parameters. Is that still true? Or can we standardize on initialize_parameters in the modules code? Happy to chat about this offline!

Contributor Author

Good point! I will add a comment to explain fully_shard, meta init, and reset/initialize_parameters.

FSDP1 calls reset_parameters for the exact same reason FSDP2 calls reset/initialize_parameters: RoPE buffers are not covered in checkpoints, and lora_a/lora_b are not covered in checkpoints when resume_training=False.

It's just that FSDP1 has a contract to call the overridden nn.Module.reset_parameters through FSDP(model, param_init_fn=...), whereas FSDP2 does not impose overriding reset_parameters, so the method can be named reset_parameters or initialize_parameters.
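
Condensing this thread into the kind of commented code the review asks for - a sketch that mirrors the new recipe code below, with the recipe's names assumed - only the tensors a checkpoint never contains are materialized and re-initialized by hand:

with utils.set_default_dtype(self._dtype), self._device:
    for m in model.modules():
        # LoRA a/b matrices only exist in a checkpoint when resuming a previous
        # LoRA run, so initialize them here otherwise.
        if isinstance(m, LoRALinear) and not lora_weights_state_dict:
            # to_empty() gives the meta-device params real storage first, since
            # kaiming_uniform_ is an in-place op and cannot run on meta tensors.
            m.to_empty(device=self._device)
            m.initialize_parameters()
        # RoPE caches are buffers that never appear in checkpoints, so they are
        # always rebuilt after meta init.
        if isinstance(m, modules.RotaryPositionalEmbeddings):
            m.reset_parameters()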

if self._dtype == torch.bfloat16:
model = model.to(torch.bfloat16)
with utils.set_default_dtype(self._dtype), self._device:
for m in model.modules():
if isinstance(m, LoRALinear) and not lora_weights_state_dict:
# to_empty is needed since kaiming_uniform_ is inplace
m.to_empty(device=self._device)
m.initialize_parameters()
if isinstance(m, modules.RotaryPositionalEmbeddings):
Contributor

Just to clarify, we special-case RoPE because the buffer is not being loaded from a state dict, right?

Contributor Author

that's correct

Contributor

Similar comment here, let's document what's happening so that users can easily understand why we initialize these modules separately.

m.reset_parameters()

model = model.to(self._dtype)

# LoRA hyper-params needed for merging weights while saving checkpoints
self._lora_rank = cfg_model.lora_rank
self._lora_alpha = cfg_model.lora_alpha

# Note: this needs to be set before wrapping with FSDP
self.adapter_params = get_adapter_params(model)
set_trainable_params(model, self.adapter_params)

model = FSDP(
module=model,
auto_wrap_policy=utils.lora_fsdp_wrap_policy(
modules_to_wrap={modules.TransformerDecoderLayer}
),
sharding_strategy=torch.distributed.fsdp.ShardingStrategy.FULL_SHARD,
device_id=self._device,
# this recipe does not currently support mixed precision training
mixed_precision=None,
# Ensure we broadcast params and buffers from rank 0
sync_module_states=True,
# Initialize empty modules on all non-zero ranks
param_init_fn=(
lambda module: module.to_empty(
device=torch.device("cuda"), recurse=False
)
if not self._is_rank_zero
else None
),
)

# Ensure no params and buffers are on meta device
utils.validate_no_params_on_meta_device(model)

if enable_activation_checkpointing:
utils.set_activation_checkpointing(
model, auto_wrap_policy={modules.TransformerDecoderLayer}
)
if self._is_rank_zero:
log.info(
f"Model init and checkpoint loading took {time.perf_counter() - init_start:.2f} secs"
)
memory_stats = utils.get_memory_stats(device=self._device)
utils.log_memory_stats(memory_stats)

@@ -372,6 +349,7 @@ def _setup_optimizer(
if opt_state_dict:
# Note: technically we should check _contains_fsdp for
# just the state dict of the adapter cfg, but should be equivalent
# TODO: implement local -> DTensor
opt_state_dict = utils.transform_opt_state_dict(
opt_state_dict, self._model, optimizer
)
@@ -451,22 +429,21 @@ def save_checkpoint(
intermediate_checkpoint = epoch + 1 < self.total_epochs
# To prevent GPU memory from spiking during checkpoint save,
# we consolidate the full model and optim state dicts on CPU for rank 0
with FSDP.state_dict_type(
self._model,
StateDictType.FULL_STATE_DICT,
FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
):
cpu_state_dict = self._model.state_dict()
if intermediate_checkpoint:
opt_state_dict = FSDP.optim_state_dict(self._model, self._optimizer)
else:
opt_state_dict = None

cpu_state_dict = utils.get_full_model_state_dict(
self._model, self._is_rank_zero
)

if intermediate_checkpoint:
opt_state_dict = utils.get_full_optimizer_state_dict(
self._optimizer, self._is_rank_zero
)
else:
opt_state_dict = None

# Now that we have the model and opt state dict, create the actual checkpoint dict
# to be sent to the checkpointer and ultimately written to file
if self._is_rank_zero:

# Filter out the adapter keys and weights from the model state dict. These will
# be saved separately
adapter_key_filter = lambda x: x in self.adapter_params
3 changes: 3 additions & 0 deletions torchtune/utils/__init__.py
@@ -16,9 +16,12 @@
from ._device import get_device
from ._distributed import ( # noqa
contains_fsdp,
get_full_model_state_dict,
get_full_optimizer_state_dict,
get_world_size_and_rank,
init_distributed,
is_distributed,
load_from_full_state_dict,
lora_fsdp_wrap_policy,
prepare_model_for_fsdp_with_meta_device,
validate_no_params_on_meta_device,
72 changes: 70 additions & 2 deletions torchtune/utils/_distributed.py
@@ -8,23 +8,25 @@
import logging
import os
from itertools import chain
from typing import Callable, Dict, Optional, Set, Tuple, Type, Union
from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, Union

import torch
import torch.distributed as dist
import torch.distributed._composable.fsdp
from torch import nn
from torch.distributed._tensor import distribute_tensor, DTensor
from torch.distributed.fsdp import (
FullyShardedDataParallel as FSDP,
MixedPrecision,
ShardingStrategy,
)
from torch.distributed.fsdp.wrap import ModuleWrapPolicy
from torch.optim import Optimizer
from torchtune.modules.peft.lora import (
_lora_a_init_params,
_lora_b_init_params,
LoRALinear,
)

from torchtune.utils._device import _validate_device_from_env, get_device
from torchtune.utils.logging import get_logger

@@ -297,3 +299,69 @@ def lora_wrap_fsdp(module: nn.Module, recurse: bool, **kwargs):
return isinstance(module, tuple(modules_to_wrap))

return lora_wrap_fsdp


def load_from_full_state_dict(
model: torch.distributed._composable.fsdp.FSDP,
full_sd: Dict[str, Any],
device: torch.device,
is_rank_zero: bool,
):
meta_sharded_sd = model.state_dict()
sharded_sd = {}
for param_name, full_tensor in full_sd.items():
sharded_meta_param = meta_sharded_sd.get(param_name)
if is_rank_zero:
full_tensor = full_tensor.detach().to(device)
else:
full_tensor = torch.empty(
sharded_meta_param.size(),
device=device,
dtype=sharded_meta_param.dtype,
)
torch.distributed.broadcast(full_tensor, src=0)
sharded_tensor = distribute_tensor(
full_tensor, sharded_meta_param.device_mesh, sharded_meta_param.placements
Contributor

Where is the device_mesh and placements information coming from?

Contributor Author

It's from fully_shard(model, mesh). We are using the default mesh where every rank participates in FSDP, since there is no 2D/3D parallelism involved.

After fully_shard(model, mesh), model.parameters() are converted from plain tensors to DTensors carrying that mesh; see the sketch after this function.
)
sharded_sd[param_name] = nn.Parameter(sharded_tensor)
model.load_state_dict(sharded_sd, strict=False, assign=True)
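
As a small illustration of the point above (build_model is a hypothetical stand-in for the recipe's config.instantiate call, and torchtune's modules is assumed to be imported): after fully_shard, each parameter is already a DTensor carrying the device_mesh and placements that distribute_tensor reuses.

import torch
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed._tensor import DTensor

with torch.device("meta"):
    model = build_model()  # hypothetical constructor; the recipe uses config.instantiate
for m in model.modules():
    if isinstance(m, modules.TransformerDecoderLayer):
        fully_shard(m)
fully_shard(model)

for param in model.parameters():
    # param.device_mesh and param.placements are exactly what
    # load_from_full_state_dict passes to distribute_tensor.
    assert isinstance(param, DTensor)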
Contributor

If we catch missing and unexpected keys from load_state_dict with strict=False, what format will the keys be in? Previously with FSDP1 the keys contained all the info about FSDP wrapping, e.g. model.layers.0._fsdp_flat_param.attn.q_proj.weight (probably not exactly right, but something like that). Will that still be the case here?

Contributor Author

For FSDP2, they are clean FQNs without any FSDP prefix, for example layers.0.attn.q_proj.lora_a.weight.

FSDP2 is clean because 1) fully_shard registers hooks instead of wrapping the nn.Module, and 2) fully_shard sets module.__class__ = type(f"FSDP{cls.__name__}", (FSDPModule, cls), dct) https://fburl.com/i20yr3s2

Contributor

This is great! I think this means we can actually validate the LoRA state dict load more cleanly (note that we currently have two separate utilities for this for the single-device vs. distributed case because of the FSDP prefix issue). Not a concern for this PR, but this will allow us to clean up our code a bit.
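
For reference, a minimal illustration of that point (key names are examples from this thread, not captured output):

# With FSDP2 the module tree keeps its original names, so a non-strict load
# reports plain FQNs that can be matched against adapter/base key lists directly.
missing, unexpected = model.load_state_dict(sharded_sd, strict=False, assign=True)
# e.g. missing may contain "layers.0.attn.q_proj.lora_a.weight" rather than an
# FSDP1-style "..._fsdp_wrapped_module..." key.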



def get_full_model_state_dict(
model: torch.distributed._composable.fsdp.FSDP,
is_rank_zero: bool,
) -> Dict[str, Any]:
sharded_sd = model.state_dict()
cpu_state_dict = {}
for param_name, sharded_param in sharded_sd.items():
full_param = sharded_param.full_tensor()
if is_rank_zero:
cpu_state_dict[param_name] = full_param.cpu()
else:
del full_param
return cpu_state_dict


def get_full_optimizer_state_dict(
opt: Optimizer,
is_rank_zero: bool,
) -> Dict[str, Any]:
sharded_sd = opt.state_dict()
sharded_state = sharded_sd["state"]
full_state = {}
for group_id, sharded_group in sharded_state.items():
group_state = {}
for attr, sharded_tensor in sharded_group.items():
if isinstance(sharded_tensor, DTensor):
full_tensor = sharded_tensor.full_tensor()
else:
full_tensor = sharded_tensor
if is_rank_zero:
group_state[attr] = full_tensor.cpu()
else:
del full_tensor
full_state[group_id] = group_state
return {
"param_groups": sharded_sd["param_groups"],
"state": full_state,
}