Refactor AlpacaDataset with InstructDataset and add builders (#520)
RdoubleA authored Mar 20, 2024
1 parent 3763738 commit 34aeb98
Showing 17 changed files with 336 additions and 118 deletions.
2 changes: 1 addition & 1 deletion docs/source/api_ref_datasets.rst
@@ -10,5 +10,5 @@ torchtune.datasets
:toctree: generated/
:nosignatures:

- AlpacaDataset
+ alpaca_dataset
SlimOrcaDataset
31 changes: 14 additions & 17 deletions docs/source/examples/configs.rst
@@ -55,13 +55,13 @@ common examples of this. You can easily do this using the :code:`_component_`
subfield. In :code:`_component_`, you need to specify the dotpath of the object
you wish to instantiate in the recipe. The dotpath is the exact path you would use
to import the object normally in a Python file. For example, to specify the
- :class:`~torchtune.datasets._alpaca.AlpacaDataset` in your config with custom
+ :class:`~torchtune.datasets._alpaca.alpaca_dataset` in your config with custom
arguments:

.. code-block:: yaml
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: False
Here, we are changing the default value for :code:`train_on_input` from :code:`True`
@@ -80,7 +80,7 @@ instance of the specified object in your recipe's setup like so:
This will automatically use any keyword arguments specified in the fields under
:code:`dataset`.

- As written, the preceding example will actually throw an error. If you look at the constructor for :class:`~torchtune.datasets._alpaca.AlpacaDataset`,
+ As written, the preceding example will actually throw an error. If you look at the method for :class:`~torchtune.datasets._alpaca.alpaca_dataset`,
you'll notice that we're missing a required positional argument, the tokenizer.
Since this is another configurable TorchTune object, let's understand how to handle
this by taking a look at the :func:`~torchtune.config._instantiate.instantiate` API.
@@ -106,7 +106,7 @@ keyword arguments not specified in the config if we'd like:
path: /tmp/tokenizer.model
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
.. code-block:: python
@@ -116,14 +116,11 @@ keyword arguments not specified in the config if we'd like:
# Note the API of the dataset we specified - we need to pass in a tokenizer
# and any optional keyword arguments
- class AlpacaDataset(Dataset):
-     def __init__(
-         self,
-         tokenizer: Tokenizer,
-         train_on_input: bool = True,
-         use_clean: bool = False,
-         **kwargs,
-     ) -> None;
+ def alpaca_dataset(
+     tokenizer: Tokenizer,
+     train_on_input: bool = True,
+     use_clean: bool = False,
+ ) -> InstructDataset:
from torchtune import config
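# A hedged sketch, not the verbatim continuation of the docs snippet above:
# assume `cfg` is the parsed config holding the `tokenizer` and `dataset`
# nodes shown earlier.
tokenizer = config.instantiate(cfg.tokenizer)

# Positional arguments that are not specified in the YAML (here, the tokenizer)
# can be passed to instantiate alongside the config node.
dataset = config.instantiate(cfg.dataset, tokenizer)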
@@ -171,15 +168,15 @@ make it significantly easier to debug.
# dont do this
alpaca_dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
slimorca_dataset:
...
# do this
dataset:
# change this in config or override when needed
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
Use public APIs only
@@ -194,12 +191,12 @@ component dotpath.
# don't do this
dataset:
- _component_: torchtune.datasets._alpaca.AlpacaDataset
+ _component_: torchtune.datasets._alpaca.alpaca_dataset
train_on_input: True
# do this
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
@@ -226,7 +223,7 @@ name directly. Any nested fields in the components can be overridden with dot no
.. code-block:: yaml
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
.. code-block:: bash
6 changes: 3 additions & 3 deletions docs/source/examples/finetune_llm.rst
@@ -35,7 +35,7 @@ An example config for training the Llama 7B model using the Alpaca dataset looks
# Dataset
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
shuffle: True
# Model Arguments
@@ -67,14 +67,14 @@ To run the recipe without any changes on 4 GPUs, launch a training run using Tun
Dataset
-------

- In this example, we use :class:`~torchtune.datasets.AlpacaDataset`
+ In this example, we use :class:`~torchtune.datasets.alpaca_dataset`
from Stanford. The following parameters are related to the data:

.. code-block:: python
# Point the dataset to the Alpaca Dataset implementation in TorchTune
# This is set in the config
- dataset: AlpacaDataset
+ dataset: alpaca_dataset
# Don't mask the prompt during training
# This is the default value
2 changes: 1 addition & 1 deletion docs/source/examples/first_finetune_tutorial.rst
@@ -102,7 +102,7 @@ lowering the epochs to 1 so you can see results sooner, and updating the learnin
# Dataset
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
seed: 42
shuffle: True
@@ -10,7 +10,7 @@ tokenizer:

# Dataset
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
seed: null
shuffle: True
@@ -10,7 +10,7 @@ tokenizer:

# Dataset
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
seed: null
shuffle: True
@@ -29,7 +29,7 @@ tokenizer:

# Dataset and Sampler
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
use_clean: True
seed: null
@@ -29,7 +29,7 @@ tokenizer:

# Dataset and Sampler
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: True
use_clean: True
seed: null
2 changes: 1 addition & 1 deletion tests/recipes/full_finetune_test_config.yaml
@@ -11,7 +11,7 @@ tokenizer:

# Dataset and Sampler
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: False
seed: 9
shuffle: True
2 changes: 1 addition & 1 deletion tests/recipes/lora_finetune_test_config.yaml
@@ -12,7 +12,7 @@ tokenizer:

# Dataset and Sampler
dataset:
- _component_: torchtune.datasets.AlpacaDataset
+ _component_: torchtune.datasets.alpaca_dataset
train_on_input: False
seed: 9
shuffle: True
25 changes: 13 additions & 12 deletions tests/torchtune/datasets/test_alpaca_dataset.py
@@ -10,7 +10,8 @@

from tests.test_utils import get_assets_path

- from torchtune.datasets._alpaca import AlpacaDataset, CROSS_ENTROPY_IGNORE_IDX
+ from torchtune.datasets._alpaca import alpaca_dataset
+ from torchtune.datasets._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.modules.tokenizer import Tokenizer


@@ -21,7 +22,7 @@ def tokenizer(self):
# spm.SentencePieceTrainer.train('--input=<TRAIN_FILE> --model_prefix=m --vocab_size=2000')
return Tokenizer.from_file(str(get_assets_path() / "m.model"))

- @patch("torchtune.datasets._alpaca.load_dataset")
+ @patch("torchtune.datasets._instruct.load_dataset")
def test_label_no_masking(self, load_dataset, tokenizer):
"""
Test whether the input and the labels are correctly created when the input is not masked.
@@ -40,15 +41,15 @@ def test_label_no_masking(self, load_dataset, tokenizer):
}
]

- alpaca_dataset = AlpacaDataset(tokenizer=tokenizer)
- input, labels = alpaca_dataset[0]
+ alpaca_ds = alpaca_dataset(tokenizer=tokenizer)
+ input, labels = alpaca_ds[0]

assert len(input) == len(labels)
assert labels[-1] == tokenizer.eos_id
assert input[0] == tokenizer.bos_id
assert CROSS_ENTROPY_IGNORE_IDX not in labels

- @patch("torchtune.datasets._alpaca.load_dataset")
+ @patch("torchtune.datasets._instruct.load_dataset")
def test_label_masking(self, load_dataset, tokenizer):
"""
Test whether the input and the labels are correctly created when the input is masked.
@@ -67,23 +68,23 @@ def test_label_masking(self, load_dataset, tokenizer):
}
]

- alpaca_dataset = AlpacaDataset(tokenizer=tokenizer, train_on_input=False)
+ alpaca_ds = alpaca_dataset(tokenizer=tokenizer, train_on_input=False)

# Extract the prompt and tokenize it; we'll need this to test whether we're masking the
# input correctly
- sample = alpaca_dataset._data[0]
- prompt = alpaca_dataset.template.format(sample=sample)
+ sample = alpaca_ds._data[0]
+ prompt = alpaca_ds.template.format(sample=sample)
encoded_prompt = tokenizer.encode(text=prompt, add_bos=True, add_eos=False)

# Generate the input and labels
- input, labels = alpaca_dataset[0]
+ input, labels = alpaca_ds[0]

assert len(input) == len(labels)
assert labels[-1] == tokenizer.eos_id
assert input[0] == tokenizer.bos_id
assert labels.count(CROSS_ENTROPY_IGNORE_IDX) == len(encoded_prompt)

- @patch("torchtune.datasets._alpaca.load_dataset")
+ @patch("torchtune.datasets._instruct.load_dataset")
def test_alpaca_clean(self, load_dataset, tokenizer):
"""
Test whether the input and the labels are correctly created when the input is not masked.
@@ -102,8 +103,8 @@ def test_alpaca_clean(self, load_dataset, tokenizer):
}
]

- alpaca_dataset = AlpacaDataset(tokenizer=tokenizer, use_clean=True)
- input, labels = alpaca_dataset[0]
+ alpaca_ds = alpaca_dataset(tokenizer=tokenizer, use_clean=True)
+ input, labels = alpaca_ds[0]

assert len(input) == len(labels)
assert labels[-1] == tokenizer.eos_id
99 changes: 99 additions & 0 deletions tests/torchtune/datasets/test_instruct_dataset.py
@@ -0,0 +1,99 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from unittest import mock

from torchtune.datasets import InstructDataset
from torchtune.datasets._common import CROSS_ENTROPY_IGNORE_IDX


class DummyTokenizer:
    def encode(self, text, **kwargs):
        words = text.split()
        return [len(word) for word in words]


def dummy_transform(sample):
    sample["input"] = sample["input"] + " asdfghjkl; "
    sample["instruction"] = sample["instruction"] + " asdfghjkl; "
    return sample


class DummyTemplate:
    def __init__(self, template):
        self.template = template

    def format(self, sample, column_map):
        return self.template.format(**sample)


class TestInstructDataset:
    template = DummyTemplate(
        "Instruction:\n{instruction}\n\nInput:\n{input}\n\nResponse: "
    )
    expected_tokenized_prompts = [
        [12, 4, 2, 3, 2, 12, 10, 6, 4, 2, 3, 2, 6, 10, 9, 1, 5, 4, 4, 3, 6, 2, 4],
        [12, 4, 2, 2, 12, 10, 6, 4, 2, 2, 6, 10, 9, 1, 6, 4, 4, 3, 6, 2, 4],
    ]

    def get_samples(self):
        return [
            {
                "instruction": "This is not an instruction.",
                "input": "This is not an input.",
                "output": "I never know what I'm doing, do you?",
            },
            {
                "instruction": "This is an instruction.",
                "input": "This is an input.",
                "output": "I always know what I'm doing, do you?",
            },
        ]

    @mock.patch("torchtune.datasets._instruct.load_dataset")
    def test_get_item_no_train_on_input(self, mock_load_dataset):
        mock_load_dataset.return_value = self.get_samples()
        prompt_lengths = (15, 13)
        expected_labels = [
            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0] + [1, 5, 4, 4, 3, 6, 2, 4],
            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[1] + [1, 6, 4, 4, 3, 6, 2, 4],
        ]

        dataset = InstructDataset(
            tokenizer=DummyTokenizer(),
            source="iam/agoofy/goober",
            template=self.template,
            transform=dummy_transform,
            train_on_input=False,
        )
        assert len(dataset) == 2
        mock_load_dataset.assert_called_once()

        for i in range(len(dataset)):
            prompt, label = dataset[i]
            print(prompt, label)
            assert prompt == self.expected_tokenized_prompts[i]
            assert label == expected_labels[i]

    @mock.patch("torchtune.datasets._instruct.load_dataset")
    def test_get_item_train_on_input(self, mock_load_dataset):
        mock_load_dataset.return_value = self.get_samples()
        expected_labels = self.expected_tokenized_prompts

        dataset = InstructDataset(
            tokenizer=DummyTokenizer(),
            source="iam/agoofy/goober",
            template=self.template,
            transform=dummy_transform,
            train_on_input=True,
        )
        assert len(dataset) == 2
        mock_load_dataset.assert_called_once()

        for i in range(len(dataset)):
            prompt, label = dataset[i]
            assert prompt == self.expected_tokenized_prompts[i]
            assert label == expected_labels[i]
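
The InstructDataset implementation itself is not part of this excerpt; the following is a minimal sketch, consistent with the tests above, of the behavior they assume (class layout, helper names, and the tokenize/mask details are assumptions rather than the exact torchtune code):

from typing import List, Tuple

from datasets import load_dataset

from torchtune.datasets._common import CROSS_ENTROPY_IGNORE_IDX


class InstructDatasetSketch:
    """Hedged stand-in for torchtune's InstructDataset, matching the tests above."""

    def __init__(self, tokenizer, source, template, transform=None, train_on_input=False):
        self._tokenizer = tokenizer
        self._data = load_dataset(source)  # patched out via mock.patch in the tests
        self.template = template
        self._transform = transform
        self.train_on_input = train_on_input

    def __len__(self) -> int:
        return len(self._data)

    def __getitem__(self, index: int) -> Tuple[List[int], List[int]]:
        sample = self._data[index]
        if self._transform is not None:
            sample = self._transform(sample)
        # Build and tokenize the prompt, then tokenize the response separately so
        # the prompt length is known for masking.
        prompt = self.template.format(sample=sample, column_map=None)
        encoded_prompt = self._tokenizer.encode(text=prompt, add_bos=True, add_eos=False)
        encoded_response = self._tokenizer.encode(
            text=sample["output"], add_bos=False, add_eos=True
        )
        tokens = encoded_prompt + encoded_response
        # With train_on_input=False the prompt tokens are excluded from the loss
        # via CROSS_ENTROPY_IGNORE_IDX; otherwise the labels mirror the tokens.
        if self.train_on_input:
            labels = list(tokens)
        else:
            labels = [CROSS_ENTROPY_IGNORE_IDX] * len(encoded_prompt) + encoded_response
        return tokens, labels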
8 changes: 5 additions & 3 deletions torchtune/datasets/__init__.py
@@ -4,10 +4,12 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

- from ._alpaca import AlpacaDataset
- from ._slimorca import SlimOrcaDataset
+ from torchtune.datasets._alpaca import alpaca_dataset
+ from torchtune.datasets._instruct import InstructDataset
+ from torchtune.datasets._slimorca import SlimOrcaDataset

__all__ = [
- "AlpacaDataset",
+ "alpaca_dataset",
"SlimOrcaDataset",
+ "InstructDataset",
]
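
The refactored torchtune/datasets/_alpaca.py is not shown in this excerpt; a plausible shape for the new builder named in the commit title is sketched below (the Hugging Face dataset names and the inline template are assumptions, not the committed code):

from torchtune.datasets._instruct import InstructDataset
from torchtune.modules.tokenizer import Tokenizer


class _AlpacaTemplateSketch:
    """Assumed stand-in for the Alpaca instruct prompt template."""

    template = (
        "Below is an instruction that describes a task, paired with an input that "
        "provides further context. Write a response that appropriately completes "
        "the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )

    def format(self, sample, column_map=None):
        return self.template.format(**sample)


def alpaca_dataset(
    tokenizer: Tokenizer,
    train_on_input: bool = True,
    use_clean: bool = False,
) -> InstructDataset:
    # use_clean switches to the cleaned Alpaca variant; both dataset names are
    # assumptions based on the commonly used public Alpaca datasets.
    source = "yahma/alpaca-cleaned" if use_clean else "tatsu-lab/alpaca"
    return InstructDataset(
        tokenizer=tokenizer,
        source=source,
        template=_AlpacaTemplateSketch(),
        train_on_input=train_on_input,
    )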