Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7/7] Multimodal datasets (The Cauldron, LLaVA-Instruct-150K) #1158

Merged
merged 60 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
75dae87
complete tokenizer refactor
RdoubleA Jun 12, 2024
0c20ba9
move tokenizers under data/
RdoubleA Jun 12, 2024
730a2c9
fix all tests
RdoubleA Jun 12, 2024
acf7e81
Merge branch 'main' into tokenizer
RdoubleA Jun 21, 2024
2ae157c
start to address comments
RdoubleA Jun 22, 2024
6a50cd5
load in special tokens, move tokenizer directory back, address comments
RdoubleA Jun 24, 2024
61534d0
fix encode whitespace
RdoubleA Jun 24, 2024
1d6e5e3
updates after manual comparisons
RdoubleA Jun 25, 2024
5712de4
default special tokens
RdoubleA Jun 26, 2024
d84bbda
fix docs
RdoubleA Jun 26, 2024
5a8b82b
fix doc strings
RdoubleA Jun 26, 2024
52643cb
Merge branch 'main' into tokenizer
RdoubleA Jun 26, 2024
a00c1dc
fix tests
RdoubleA Jun 26, 2024
29273ca
fix SP test
RdoubleA Jun 26, 2024
aa43095
add image support
RdoubleA Jun 26, 2024
8afaaf9
tool support
RdoubleA Jun 26, 2024
d3d4b66
update tests
RdoubleA Jun 26, 2024
d326dca
update tests
RdoubleA Jun 26, 2024
58e3e9d
use images as attachments instead
RdoubleA Jun 27, 2024
7fdccae
update all tests
RdoubleA Jun 27, 2024
820d9ac
use list of dicts for MM messages
RdoubleA Jun 27, 2024
7ba4216
fix chat formats
RdoubleA Jul 1, 2024
42f8c83
add multimodal dataset, test, and the cauldron
RdoubleA Jun 26, 2024
7cad2dc
multimodal dataset test
RdoubleA Jun 27, 2024
335e85f
fix rebase
RdoubleA Jul 1, 2024
adca77e
Merge branch 'main' into tokenizer
RdoubleA Jul 2, 2024
b204563
update api ref
RdoubleA Jul 2, 2024
e236916
Merge branch 'main' into tokenizer
RdoubleA Jul 2, 2024
93028cf
fix llama3 tokenizer test:
RdoubleA Jul 2, 2024
fb12cbb
add image support
RdoubleA Jun 26, 2024
b5bf410
tool support
RdoubleA Jun 26, 2024
00f266f
update tests
RdoubleA Jun 26, 2024
c815069
update tests
RdoubleA Jun 26, 2024
21b3ea8
use images as attachments instead
RdoubleA Jun 27, 2024
adbfb20
update all tests
RdoubleA Jun 27, 2024
1e40a9d
use list of dicts for MM messages
RdoubleA Jun 27, 2024
0d3665c
fix chat formats
RdoubleA Jul 1, 2024
95edf70
run linter
RdoubleA Jul 2, 2024
a3067aa
Merge branch 'main' into tokenizer_updates
RdoubleA Jul 2, 2024
d49febf
merge main
RdoubleA Jul 2, 2024
7da4189
fix chat formats
RdoubleA Jul 3, 2024
58babf0
Merge branch 'tokenizer_updates' into mm_dataset
RdoubleA Jul 3, 2024
7bcdaf8
fix merge
RdoubleA Jul 3, 2024
82e1dea
Merge branch 'main' into mm_dataset
RdoubleA Jul 9, 2024
ff81c5c
fix merge
RdoubleA Jul 9, 2024
258e98f
multimodal dataset, unit test, and two example dataset builders with …
RdoubleA Jul 10, 2024
1410d70
Merge branch 'main' into mm_dataset
RdoubleA Jul 10, 2024
5aea048
Merge branch 'main' into mm_dataset
RdoubleA Aug 21, 2024
2731a60
update with latest APIs
RdoubleA Aug 22, 2024
8530958
fix lint
RdoubleA Aug 22, 2024
ce1fe8a
Merge remote-tracking branch 'upstream/main' into mm_dataset
RdoubleA Aug 22, 2024
ddd5e86
Merge branch 'main' into mm_dataset
RdoubleA Aug 26, 2024
939a3f5
Merge branch 'main' into mm_dataset
RdoubleA Sep 3, 2024
278649a
remove image handling
RdoubleA Sep 3, 2024
5f2021f
Merge branch 'main' into mm_dataset
RdoubleA Sep 3, 2024
bb1a8b5
separate llava transform
RdoubleA Sep 3, 2024
e16b2ad
update tests
RdoubleA Sep 3, 2024
8079294
fix tests
RdoubleA Sep 3, 2024
b006a90
Merge branch 'main' into mm_dataset
RdoubleA Sep 4, 2024
703b986
update docstrings
RdoubleA Sep 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added tests/assets/test_image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 23 additions & 1 deletion tests/torchtune/data/test_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# LICENSE file in the root directory of this source tree.

import pytest
from torchtune.data import Message, truncate, validate_messages
from torchtune.data import Message, split_text_by_image_tag, truncate, validate_messages


def test_truncate():
Expand Down Expand Up @@ -88,3 +88,25 @@ def test_validate_messages():
match="Assistant message before expected user message at index 0 in messages",
):
validate_messages(messages)


def test_split_text_by_image_tag():
    """Splitting text around an image tag yields typed content chunks in order."""
    # Tag in the middle: text before and after the tag is preserved verbatim.
    assert split_text_by_image_tag("hello <image>world", "<image>") == [
        {"type": "text", "content": "hello "},
        {"type": "image"},
        {"type": "text", "content": "world"},
    ]

    # Leading tag plus a second occurrence; no empty text chunk is emitted
    # for the zero-length text before the first tag.
    assert split_text_by_image_tag("[image]hello [image]world", "[image]") == [
        {"type": "image"},
        {"type": "text", "content": "hello "},
        {"type": "image"},
        {"type": "text", "content": "world"},
    ]

    # Tag absent from the text: the whole string comes back as one text chunk.
    assert split_text_by_image_tag("hello world", "asdfghjkl;") == [
        {"type": "text", "content": "hello world"}
    ]
138 changes: 138 additions & 0 deletions tests/torchtune/datasets/test_llava_instruct_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from collections import Counter
from pathlib import Path
from unittest.mock import patch

import pytest
from datasets import Dataset
from PIL import Image

from tests.test_utils import DummyTokenizer
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX

from torchtune.datasets import llava_instruct_dataset

ASSETS = Path(__file__).parent.parent.parent / "assets"


class TestLLaVAInstructDataset:
    """Tests for ``llava_instruct_dataset``: tokenization and label masking.

    Both tests feed the same mocked HF dataset row through the builder; the
    only difference is ``train_on_input``, so the sample and the expected
    token counts are shared via class-level helpers instead of duplicated.
    """

    # Token-id -> occurrence count expected after DummyTokenizer tokenizes the
    # mock conversation below. Identical for both tests since the input is.
    EXPECTED_COUNT = {
        3: 17,
        2: 15,
        4: 11,
        8: 9,
        5: 8,
        7: 8,
        6: 5,
        1: 5,
        9: 2,
        0: 1,
        -2: 1,
        12: 1,
        10: 1,
        -1: 1,
    }

    @pytest.fixture
    def tokenizer(self):
        return DummyTokenizer()

    @staticmethod
    def _mock_sample():
        """One LLaVA-Instruct-style row referencing the test image asset."""
        return {
            "image": "test_image.jpg",
            "conversations": [
                {
                    "from": "human",
                    "value": "<image>\nWhat can you infer about the man's outdoor activity?",
                },
                {
                    "from": "gpt",
                    "value": "From the image, we can infer that the man is engaging in a "
                    "recreational activity involving a frisbee in a park or grass field. "
                    "The frisbee is in the air, and the man appears to be either catching "
                    "or throwing it. This suggests that he might be playing a casual game "
                    "of catch with a friend or practicing his frisbee skills, enjoying the "
                    "outdoors and getting some physical activity at the same time.",
                },
            ],
        }

    @patch("torchtune.datasets._sft.load_dataset")
    def test_label_no_masking(self, load_dataset, tokenizer):
        """
        Test whether the input and the labels are correctly created when the input is not masked.
        """
        # mock the call to HF datasets
        load_dataset.return_value = Dataset.from_list([self._mock_sample()])

        ds = llava_instruct_dataset(
            model_transform=tokenizer, train_on_input=True, coco_image_dir=str(ASSETS)
        )
        input, labels, images = ds[0]["tokens"], ds[0]["labels"], ds[0]["images"][0]

        assert Counter(input) == self.EXPECTED_COUNT
        # train_on_input=True: labels mirror the inputs, nothing is masked out.
        assert Counter(labels) == self.EXPECTED_COUNT
        assert isinstance(images, Image.Image)

    @patch("torchtune.datasets._sft.load_dataset")
    def test_label_masking(self, load_dataset, tokenizer):
        """
        Test whether the input and the labels are correctly created when the input is masked.
        """
        # mock the call to HF datasets
        load_dataset.return_value = Dataset.from_list([self._mock_sample()])

        ds = llava_instruct_dataset(
            model_transform=tokenizer, train_on_input=False, coco_image_dir=str(ASSETS)
        )
        input, labels, images = ds[0]["tokens"], ds[0]["labels"], ds[0]["images"][0]

        assert Counter(input) == self.EXPECTED_COUNT
        # train_on_input=False: the 11 user-prompt token positions are ignored.
        assert labels.count(CROSS_ENTROPY_IGNORE_IDX) == 11
        assert isinstance(images, Image.Image)
157 changes: 157 additions & 0 deletions tests/torchtune/datasets/test_the_cauldron_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pdb
from unittest.mock import patch

import pytest
import torch
from datasets import Dataset

from tests.test_utils import DummyTokenizer
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX

from torchtune.datasets import the_cauldron_dataset
from torchvision.transforms import PILToTensor, ToPILImage


class TestTheCauldronDataset:
    """Tests for ``the_cauldron_dataset``: tokenization, label masking, and
    image pass-through.

    Both tests feed the same mocked HF dataset row through the builder; the
    only difference is ``train_on_input``, so the sample row and the expected
    token sequence are shared via class-level helpers instead of duplicated.
    """

    # Token ids DummyTokenizer produces for the mock Q/A sample below,
    # including BOS (0), image token (-2), and EOS (-1) markers.
    EXPECTED_TOKENS = [
        0, -2, 9, 4, 2, 11, 3, 10, 4, 3, 8, 2, 6, 2,
        6, 7, 2, 8, 2, 4, 6, 4, 3, 7, 7, 1, -1,
    ]

    @pytest.fixture
    def tokenizer(self):
        return DummyTokenizer()

    @staticmethod
    def _mock_sample(image_tensor):
        """One Cauldron-style row wrapping ``image_tensor`` as a PIL image."""
        return {
            "images": [ToPILImage()(image_tensor)],
            "texts": [
                {
                    "user": "Question: What do respiration and combustion give out"
                    "\nChoices:\nA. Oxygen\nB. Carbon dioxide\nC. Nitrogen\nD. Heat"
                    "\nAnswer with the letter.",
                    "assistant": "Answer: B",
                    "source": "AI2D",
                }
            ],
        }

    @patch("torchtune.datasets._sft.load_dataset")
    def test_label_no_masking(self, load_dataset, tokenizer):
        """
        Test whether the input and the labels are correctly created when the input is not masked.
        """
        image_tensor = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
        # mock the call to HF datasets
        load_dataset.return_value = Dataset.from_list([self._mock_sample(image_tensor)])

        ds = the_cauldron_dataset(
            model_transform=tokenizer, subset="dummy", train_on_input=True
        )
        input, labels, images = (
            ds[0]["tokens"],
            ds[0]["labels"],
            ds[0]["images"][0],
        )

        assert input == self.EXPECTED_TOKENS
        # train_on_input=True: labels mirror the inputs, nothing is masked out.
        assert labels == input
        # NOTE: a stray `pdb.set_trace()` debug breakpoint was removed here —
        # it would hang any non-interactive test run at an interactive prompt.
        torch.testing.assert_close(PILToTensor()(images), image_tensor)

    @patch("torchtune.datasets._sft.load_dataset")
    def test_label_masking(self, load_dataset, tokenizer):
        """
        Test whether the input and the labels are correctly created when the input is masked.
        """
        image_tensor = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
        # mock the call to HF datasets
        load_dataset.return_value = Dataset.from_list([self._mock_sample(image_tensor)])

        ds = the_cauldron_dataset(
            model_transform=tokenizer, subset="dummy", train_on_input=False
        )
        input, labels, images = (
            ds[0]["tokens"],
            ds[0]["labels"],
            ds[0]["images"][0],
        )

        assert input == self.EXPECTED_TOKENS
        # train_on_input=False: the 24 user-prompt token positions are ignored.
        assert labels.count(CROSS_ENTROPY_IGNORE_IDX) == 24
        torch.testing.assert_close(PILToTensor()(images), image_tensor)
4 changes: 3 additions & 1 deletion torchtune/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Message,
Role,
ShareGPTToMessages,
validate_messages,
)
from torchtune.data._prompt_templates import (
ChatMLTemplate,
Expand All @@ -29,7 +30,7 @@
QuestionAnswerTemplate,
SummarizeTemplate,
)
from torchtune.data._utils import truncate, validate_messages
from torchtune.data._utils import split_text_by_image_tag, truncate

__all__ = [
"ChatFormat",
Expand All @@ -46,6 +47,7 @@
"Message",
"validate_messages",
"Role",
"split_text_by_image_tag",
"PromptTemplateInterface",
"PromptTemplate",
"InputOutputToMessages",
Expand Down
Loading
Loading