[Break BC] Create training directory, move checkpointing #1432

Merged · 5 commits · Aug 30, 2024
19 changes: 19 additions & 0 deletions docs/source/api_ref_training.rst
@@ -4,6 +4,25 @@ torchtune.training

.. currentmodule:: torchtune.training

.. _checkpointing_label:

Checkpointing
-------------

torchtune offers checkpointers that allow seamless transitions between checkpoint formats, both for training and for interoperability with the rest of the ecosystem. For a comprehensive overview of
checkpointing, please see the :ref:`checkpointing deep-dive <understand_checkpointer>`.

.. autosummary::
:toctree: generated/
:nosignatures:

FullModelHFCheckpointer
FullModelMetaCheckpointer
FullModelTorchTuneCheckpointer
ModelType
update_state_dict_for_classifier


Reduced Precision
------------------

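Since this PR moves the checkpointing APIs from ``torchtune.utils`` to ``torchtune.training``, downstream imports need a one-line update. A minimal sketch of the migration, assuming the symbols are re-exported at the package level as the autosummary entries above suggest:

.. code-block:: python

    # Old import path, removed by this BC-breaking change:
    # from torchtune.utils import FullModelHFCheckpointer, ModelType

    # New import path after this PR:
    from torchtune.training import FullModelHFCheckpointer, ModelType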
23 changes: 2 additions & 21 deletions docs/source/api_ref_utilities.rst
@@ -1,28 +1,9 @@
=================
===============
torchtune.utils
=================
===============

.. currentmodule:: torchtune.utils


.. _checkpointing_label:

Checkpointing
-------------

torchtune offers checkpointers to allow seamless transitioning between checkpoint formats for training and interoperability with the rest of the ecosystem. For a comprehensive overview of
checkpointing, please see the :ref:`checkpointing deep-dive <understand_checkpointer>`.

.. autosummary::
:toctree: generated/
:nosignatures:

FullModelHFCheckpointer
FullModelMetaCheckpointer
FullModelTorchTuneCheckpointer
ModelType
update_state_dict_for_classifier

.. _dist_label:

Distributed
22 changes: 11 additions & 11 deletions docs/source/deep_dives/checkpointer.rst
@@ -135,8 +135,8 @@ torchtune supports three different
each of which supports a different checkpoint format.


:class:`HFCheckpointer <torchtune.utils.FullModelHFCheckpointer>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:class:`HFCheckpointer <torchtune.training.FullModelHFCheckpointer>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This checkpointer reads and writes checkpoints in a format which is compatible with the transformers
framework from Hugging Face. As mentioned above, this is the most popular format within the Hugging Face
@@ -167,7 +167,7 @@ The following snippet explains how the HFCheckpointer is setup in torchtune conf
checkpointer:

# checkpointer to use
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer

# directory with the checkpoint files
# this should match the output_dir above
@@ -205,8 +205,8 @@ The following snippet explains how the HFCheckpointer is setup in torchtune conf

|

:class:`MetaCheckpointer <torchtune.utils.FullModelMetaCheckpointer>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:class:`MetaCheckpointer <torchtune.training.FullModelMetaCheckpointer>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This checkpointer reads and writes checkpoints in a format which is compatible with the original meta-llama
github repository.
@@ -237,7 +237,7 @@ The following snippet explains how the MetaCheckpointer is setup in torchtune co
checkpointer:

# checkpointer to use
_component_: torchtune.utils.FullModelMetaCheckpointer
_component_: torchtune.training.FullModelMetaCheckpointer

# directory with the checkpoint files
# this should match the output_dir above
@@ -265,8 +265,8 @@ The following snippet explains how the MetaCheckpointer is setup in torchtune co

|

:class:`TorchTuneCheckpointer <torchtune.utils.FullModelTorchTuneCheckpointer>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:class:`TorchTuneCheckpointer <torchtune.training.FullModelTorchTuneCheckpointer>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This checkpointer reads and writes checkpoints in a format that is compatible with torchtune's
model definition. This does not perform any state_dict conversions and is currently used either
@@ -335,7 +335,7 @@ to the config file
checkpointer:

# checkpointer to use
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer

checkpoint_dir: <checkpoint_dir>

@@ -381,7 +381,7 @@ looks something like this:
checkpointer:

# checkpointer to use
_component_: torchtune.utils.FullModelHFCheckpointer
Contributor: L430 too

Contributor Author: damn, good eye
_component_: torchtune.training.FullModelHFCheckpointer

# directory with the checkpoint files
# this should match the output_dir above
@@ -427,7 +427,7 @@ For this section we'll use the Llama2 13B model in HF format.
.. code-block:: python

import torch
from torchtune.utils import FullModelHFCheckpointer, ModelType
from torchtune.training import FullModelHFCheckpointer, ModelType
from torchtune.models.llama2 import llama2_13b

# Set the right directory and files
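The diff collapses the rest of this snippet. For orientation, here is a hedged sketch of how the conversion plausibly continues under the new import path; the directory and checkpoint file names are placeholders, not values taken from the PR:

.. code-block:: python

    # Hypothetical paths and file names, shown only for illustration
    checkpoint_dir = "/tmp/Llama-2-13b-hf"
    pytorch_files = [
        "pytorch_model-00001-of-00003.bin",
        "pytorch_model-00002-of-00003.bin",
        "pytorch_model-00003-of-00003.bin",
    ]

    # Build the checkpointer; model_type tells it how to convert the
    # HF-format state dict into torchtune's model definition
    checkpointer = FullModelHFCheckpointer(
        checkpoint_dir=checkpoint_dir,
        checkpoint_files=pytorch_files,
        output_dir=checkpoint_dir,
        model_type=ModelType.LLAMA2,
    )

    # load_checkpoint returns a dict containing the converted state dict
    sd = checkpointer.load_checkpoint()
    model = llama2_13b()
    model.load_state_dict(sd["model"])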
8 changes: 4 additions & 4 deletions docs/source/deep_dives/wandb_logging.rst
@@ -87,10 +87,10 @@ A suggested approach would be something like this:
description="Model checkpoint",
# you can add whatever metadata you want as a dict
metadata={
utils.SEED_KEY: self.seed,
utils.EPOCHS_KEY: self.epochs_run,
utils.TOTAL_EPOCHS_KEY: self.total_epochs,
utils.MAX_STEPS_KEY: self.max_steps_per_epoch,
training.SEED_KEY: self.seed,
training.EPOCHS_KEY: self.epochs_run,
training.TOTAL_EPOCHS_KEY: self.total_epochs,
training.MAX_STEPS_KEY: self.max_steps_per_epoch,
}
)
wandb_at.add_file(checkpoint_file)
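For context, the surrounding artifact-logging pattern in that tutorial looks roughly like the sketch below; the artifact name and ``checkpoint_file`` path are hypothetical, and only the ``metadata`` keys are taken from the diff:

.. code-block:: python

    import wandb

    # Hypothetical artifact name and checkpoint path, for illustration only
    wandb_at = wandb.Artifact(
        name=f"checkpoint_epoch_{self.epochs_run}",
        type="model",
        description="Model checkpoint",
        # you can add whatever metadata you want as a dict
        metadata={
            training.SEED_KEY: self.seed,
            training.EPOCHS_KEY: self.epochs_run,
            training.TOTAL_EPOCHS_KEY: self.total_epochs,
            training.MAX_STEPS_KEY: self.max_steps_per_epoch,
        },
    )
    wandb_at.add_file(checkpoint_file)
    wandb.log_artifact(wandb_at)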
4 changes: 2 additions & 2 deletions docs/source/tutorials/e2e_flow.rst
@@ -195,7 +195,7 @@ First, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoin
.. code-block:: yaml

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer

# directory with the checkpoint files
# this should match the output_dir specified during
@@ -262,7 +262,7 @@ Let's modify ``custom_generation_config.yaml`` to include the following changes.
.. code-block:: yaml

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer

# directory with the checkpoint files
# this should match the output_dir specified during
4 changes: 2 additions & 2 deletions docs/source/tutorials/llama3.rst
@@ -149,7 +149,7 @@ Next, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoint
_component_: torchtune.models.llama3.llama3_8b

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
_component_: torchtune.training.FullModelMetaCheckpointer

# directory with the checkpoint files
# this should match the output_dir specified during
@@ -203,7 +203,7 @@ Now we modify ``custom_generation_config.yaml`` to point to our checkpoint and t
_component_: torchtune.models.llama3.llama3_8b

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
_component_: torchtune.training.FullModelMetaCheckpointer

# directory with the checkpoint files
# this should match the output_dir specified during
4 changes: 2 additions & 2 deletions docs/source/tutorials/qat_finetune.rst
@@ -223,7 +223,7 @@ copy and make the following modifications to the quantization config:
_component_: torchtune.models.llama3.llama3_8b

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
_component_: torchtune.training.FullModelMetaCheckpointer
checkpoint_dir: <your QAT checkpoint dir>
checkpoint_files: [meta_model_0.pt]
recipe_checkpoint: null
@@ -269,7 +269,7 @@ integrated in torchtune. First, copy the evaluation config and make the followin
_component_: torchtune.models.llama3.llama3_8b

checkpointer:
_component_: torchtune.utils.FullModelTorchTuneCheckpointer
_component_: torchtune.training.FullModelTorchTuneCheckpointer
checkpoint_dir: <your quantized model checkpoint dir>
checkpoint_files: [meta_model_0-8da4w.pt]
recipe_checkpoint: null
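The many YAML changes below all follow from one mechanism: ``_component_`` paths are resolved dynamically from config strings, so a stale ``torchtune.utils.*`` value fails at runtime rather than at import time. A hedged sketch of that resolution step, using ``torchtune.config.instantiate`` on an OmegaConf node (the directory and file names are placeholders):

.. code-block:: python

    from omegaconf import OmegaConf
    from torchtune import config

    cfg = OmegaConf.create({
        "checkpointer": {
            "_component_": "torchtune.training.FullModelTorchTuneCheckpointer",
            "checkpoint_dir": "/tmp/ckpts",  # placeholder
            "checkpoint_files": ["meta_model_0-8da4w.pt"],
            "output_dir": "/tmp/ckpts",
            "model_type": "LLAMA3",
        }
    })

    # instantiate() imports the _component_ path and calls it with the
    # remaining keys as kwargs -- an old torchtune.utils path would raise here
    checkpointer = config.instantiate(cfg.checkpointer)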
2 changes: 1 addition & 1 deletion recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -31,7 +31,7 @@ tokenizer:

# Checkpointer
checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/CodeLlama-7b-hf
checkpoint_files: [
pytorch_model-00001-of-00003.bin,
2 changes: 1 addition & 1 deletion recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -32,7 +32,7 @@ tokenizer:

# Checkpointer
checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/CodeLlama-7b-hf
checkpoint_files: [
pytorch_model-00001-of-00003.bin,
2 changes: 1 addition & 1 deletion recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -32,7 +32,7 @@ tokenizer:

# Checkpointer
checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/CodeLlama-7b-hf
checkpoint_files: [
pytorch_model-00001-of-00003.bin,
2 changes: 1 addition & 1 deletion recipes/configs/dev/8B_full_experimental.yaml
@@ -35,7 +35,7 @@ model:
_component_: torchtune.models.llama3.llama3_8b

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
_component_: torchtune.training.FullModelMetaCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
checkpoint_files: [
consolidated.00.pth
2 changes: 1 addition & 1 deletion recipes/configs/dev/llama2/13B_lora_fsdp2.yaml
@@ -33,7 +33,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-13b-hf/
checkpoint_files: [
pytorch_model-00001-of-00003.bin,
2 changes: 1 addition & 1 deletion recipes/configs/dev/llama2/70B_lora_fsdp2.yaml
@@ -28,7 +28,7 @@ tokenizer:
max_seq_len: null

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-70b-hf
checkpoint_files: [
pytorch_model-00001-of-00015.bin,
2 changes: 1 addition & 1 deletion recipes/configs/dev/llama2/70B_qlora_fsdp2.yaml
@@ -28,7 +28,7 @@ tokenizer:
max_seq_len: null

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-70b-hf
checkpoint_files: [
pytorch_model-00001-of-00015.bin,
2 changes: 1 addition & 1 deletion recipes/configs/dev/llama2/7B_lora_fsdp2.yaml
@@ -37,7 +37,7 @@ tokenizer:
max_seq_len: null

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-7b-hf
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
2 changes: 1 addition & 1 deletion recipes/configs/dev/llama2/7B_qlora_fsdp2.yaml
@@ -36,7 +36,7 @@ tokenizer:
max_seq_len: null

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-7b-hf
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
2 changes: 1 addition & 1 deletion recipes/configs/eleuther_evaluation.yaml
@@ -8,7 +8,7 @@ model:
_component_: torchtune.models.llama2.llama2_7b

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-7b-hf
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/2B_full.yaml
@@ -32,7 +32,7 @@ model:
_component_: torchtune.models.gemma.gemma_2b

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-2b/
checkpoint_files: [
model-00001-of-00002.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/2B_lora.yaml
@@ -36,7 +36,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-2b/
checkpoint_files: [
model-00001-of-00002.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/2B_lora_single_device.yaml
@@ -35,7 +35,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-2b/
checkpoint_files: [
model-00001-of-00002.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -35,7 +35,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-2b/
checkpoint_files: [
model-00001-of-00002.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/7B_full.yaml
@@ -32,7 +32,7 @@ model:
_component_: torchtune.models.gemma.gemma_7b

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-7b/
checkpoint_files: [
model-00001-of-00004.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/7B_lora.yaml
@@ -36,7 +36,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-7b/
checkpoint_files: [
model-00001-of-00004.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/7B_lora_single_device.yaml
@@ -35,7 +35,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-7b/
checkpoint_files: [
model-00001-of-00004.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -35,7 +35,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/gemma-7b/
checkpoint_files: [
model-00001-of-00004.safetensors,
2 changes: 1 addition & 1 deletion recipes/configs/generation.yaml
@@ -8,7 +8,7 @@ model:
_component_: torchtune.models.llama2.llama2_7b

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-7b-hf/
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
2 changes: 1 addition & 1 deletion recipes/configs/llama2/13B_full.yaml
@@ -23,7 +23,7 @@ model:
_component_: torchtune.models.llama2.llama2_13b

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-13b-hf/
checkpoint_files: [
pytorch_model-00001-of-00003.bin,
2 changes: 1 addition & 1 deletion recipes/configs/llama2/13B_lora.yaml
@@ -29,7 +29,7 @@ model:
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-2-13b-hf/
checkpoint_files: [
pytorch_model-00001-of-00003.bin,