From c8e611673cdceeed1807d3f934c134c68bd2a9c2 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Fri, 26 Feb 2021 15:24:26 +0000 Subject: [PATCH] Rename Dataset tuple to LegacyDataset. In preparation for introducing a new Dataset class. Issue #45. --- compiler_gym/bin/datasets.py | 10 ++++++++-- compiler_gym/datasets/__init__.py | 10 ++++++++-- compiler_gym/datasets/dataset.py | 22 ++++++++++++---------- compiler_gym/envs/compiler_env.py | 26 +++++++++++++++----------- compiler_gym/envs/llvm/datasets.py | 24 ++++++++++++------------ tests/compiler_env_test.py | 6 +++--- 6 files changed, 58 insertions(+), 40 deletions(-) diff --git a/compiler_gym/bin/datasets.py b/compiler_gym/bin/datasets.py index 4cab24bc89..c6eec47306 100644 --- a/compiler_gym/bin/datasets.py +++ b/compiler_gym/bin/datasets.py @@ -139,7 +139,13 @@ import humanize from absl import app, flags -from compiler_gym.datasets.dataset import Dataset, activate, deactivate, delete, require +from compiler_gym.datasets.dataset import ( + LegacyDataset, + activate, + deactivate, + delete, + require, +) from compiler_gym.util.flags.env_from_flags import env_from_flags from compiler_gym.util.tabulate import tabulate @@ -183,7 +189,7 @@ def enumerate_directory(name: str, path: Path): for path in path.iterdir(): if not path.is_file() or not path.name.endswith(".json"): continue - dataset = Dataset.from_json_file(path) + dataset = LegacyDataset.from_json_file(path) rows.append( (dataset.name, dataset.license, dataset.file_count, dataset.size_bytes) ) diff --git a/compiler_gym/datasets/__init__.py b/compiler_gym/datasets/__init__.py index 6b9af1e543..b0dc9440c5 100644 --- a/compiler_gym/datasets/__init__.py +++ b/compiler_gym/datasets/__init__.py @@ -3,6 +3,12 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """Manage datasets of benchmarks.""" -from compiler_gym.datasets.dataset import Dataset, activate, deactivate, delete, require +from compiler_gym.datasets.dataset import ( + LegacyDataset, + activate, + deactivate, + delete, + require, +) -__all__ = ["Dataset", "require", "activate", "deactivate", "delete"] +__all__ = ["LegacyDataset", "require", "activate", "deactivate", "delete"] diff --git a/compiler_gym/datasets/dataset.py b/compiler_gym/datasets/dataset.py index bed7c9f375..32390b27d8 100644 --- a/compiler_gym/datasets/dataset.py +++ b/compiler_gym/datasets/dataset.py @@ -15,7 +15,7 @@ from compiler_gym.util.download import download -class Dataset(NamedTuple): +class LegacyDataset(NamedTuple): """A collection of benchmarks for use by an environment.""" name: str @@ -48,11 +48,11 @@ class Dataset(NamedTuple): """A list of platforms supported by this dataset. Allowed platforms are 'macos' and 'linux'.""" @classmethod - def from_json_file(cls, path: Path) -> "Dataset": + def from_json_file(cls, path: Path) -> "LegacyDataset": """Construct a dataset from a JSON metadata file. :param path: Path of the JSON metadata. - :return: A Dataset instance. + :return: A LegacyDataset instance. """ try: with open(str(path), "rb") as f: @@ -136,7 +136,7 @@ def deactivate(env, name: str) -> bool: return True -def require(env, dataset: Union[str, Dataset]) -> bool: +def require(env, dataset: Union[str, LegacyDataset]) -> bool: """Require that the given dataset is available to the environment. This will download and activate the dataset if it is not already installed. 
@@ -151,12 +151,14 @@ def require(env, dataset: Union[str, Dataset]) -> bool: :param env: The environment that this dataset is required for. :param dataset: The name of the dataset to download, the URL of the dataset, - or a :class:`Dataset` instance. + or a :class:`LegacyDataset` instance. :return: :code:`True` if the dataset was downloaded, or :code:`False` if the dataset was already available. """ - def download_and_unpack_archive(url: str, sha256: Optional[str] = None) -> Dataset: + def download_and_unpack_archive( + url: str, sha256: Optional[str] = None + ) -> LegacyDataset: json_files_before = { f for f in env.inactive_datasets_site_path.iterdir() @@ -173,9 +175,9 @@ def download_and_unpack_archive(url: str, sha256: Optional[str] = None) -> Datas new_json = json_files_after - json_files_before if not len(new_json): raise OSError(f"Downloaded dataset {url} contains no metadata JSON file") - return Dataset.from_json_file(list(new_json)[0]) + return LegacyDataset.from_json_file(list(new_json)[0]) - def unpack_local_archive(path: Path) -> Dataset: + def unpack_local_archive(path: Path) -> LegacyDataset: if not path.is_file(): raise FileNotFoundError(f"File not found: {path}") json_files_before = { @@ -193,12 +195,12 @@ def unpack_local_archive(path: Path) -> Dataset: new_json = json_files_after - json_files_before if not len(new_json): raise OSError(f"Local archive {path} contains no metadata JSON file") - return Dataset.from_json_file(list(new_json)[0]) + return LegacyDataset.from_json_file(list(new_json)[0]) with fasteners.InterProcessLock(env.datasets_site_path / "LOCK"): # Resolve the name and URL of the dataset. sha256 = None - if isinstance(dataset, Dataset): + if isinstance(dataset, LegacyDataset): name, url = dataset.name, dataset.url elif isinstance(dataset, str): # Check if we have already downloaded the dataset. diff --git a/compiler_gym/envs/compiler_env.py b/compiler_gym/envs/compiler_env.py index 985d874399..26abf8095d 100644 --- a/compiler_gym/envs/compiler_env.py +++ b/compiler_gym/envs/compiler_env.py @@ -19,7 +19,7 @@ from gym.spaces import Space from compiler_gym.compiler_env_state import CompilerEnvState -from compiler_gym.datasets.dataset import Dataset, require +from compiler_gym.datasets.dataset import LegacyDataset, require from compiler_gym.service import ( CompilerGymServiceConnection, ConnectionOpts, @@ -103,9 +103,9 @@ class CompilerEnv(gym.Env): to store benchmarks. :vartype datasets_site_path: Optional[Path] - :ivar available_datasets: A mapping from dataset name to :class:`Dataset` + :ivar available_datasets: A mapping from dataset name to :class:`LegacyDataset` objects that are available to download. - :vartype available_datasets: Dict[str, Dataset] + :vartype available_datasets: Dict[str, LegacyDataset] :ivar observation: A view of the available observation spaces that permits on-demand computation of observations. @@ -196,7 +196,7 @@ def __init__( self._service_endpoint: Union[str, Path] = service self._connection_settings = connection_settings or ConnectionOpts() self.datasets_site_path: Optional[Path] = None - self.available_datasets: Dict[str, Dataset] = {} + self.available_datasets: Dict[str, LegacyDataset] = {} # The benchmark that is currently being used, and the benchmark that # the user requested. 
Those do not always correlate, since the user @@ -818,7 +818,7 @@ def _reward_view_type(self): """ return RewardView - def require_datasets(self, datasets: List[Union[str, Dataset]]) -> None: + def require_datasets(self, datasets: List[Union[str, LegacyDataset]]) -> bool: """Require that the given datasets are available to the environment. Example usage: @@ -834,8 +834,11 @@ def require_datasets(self, datasets: List[Union[str, Dataset]]) -> None: :param datasets: A list of datasets to require. Each dataset is the name of an available dataset, the URL of a dataset to download, or a - :class:`Dataset` instance. + :class:`LegacyDataset` instance. + + :return: Whether a new dataset was downloaded. """ + self.logger.debug("Requiring datasets: %s", datasets) dataset_installed = False for dataset in datasets: dataset_installed |= require(self, dataset) @@ -849,15 +852,16 @@ def require_datasets(self, datasets: List[Union[str, Dataset]]) -> None: ), ) self.make_manifest_file() + return dataset_installed - def require_dataset(self, dataset: Union[str, Dataset]) -> None: + def require_dataset(self, dataset: Union[str, LegacyDataset]) -> bool: """Require that the given dataset is available to the environment. Alias for :meth:`env.require_datasets([dataset]) <compiler_gym.envs.CompilerEnv.require_datasets>`. :param dataset: The name of the dataset to download, the URL of the dataset, or a - :class:`Dataset` instance. + :class:`LegacyDataset` instance. """ return self.require_datasets([dataset]) @@ -885,7 +889,7 @@ def make_manifest_file(self) -> Path: ) return manifest_path - def register_dataset(self, dataset: Dataset) -> bool: + def register_dataset(self, dataset: LegacyDataset) -> bool: """Register a new dataset. After registering, the dataset name may be used by @@ -894,13 +898,13 @@ Example usage: - >>> my_dataset = Dataset(name="my-dataset-v0", ...) + >>> my_dataset = LegacyDataset(name="my-dataset-v0", ...) >>> env = gym.make("llvm-v0") >>> env.register_dataset(my_dataset) >>> env.require_dataset("my-dataset-v0") >>> env.benchmark = "my-dataset-v0/1" - :param dataset: A :class:`Dataset` instance describing the new dataset. + :param dataset: A :class:`LegacyDataset` instance describing the new dataset. :return: :code:`True` if the dataset was added, else :code:`False`. :raises ValueError: If a dataset with this name is already registered. 
""" diff --git a/compiler_gym/envs/llvm/datasets.py b/compiler_gym/envs/llvm/datasets.py index cc3d52f84c..cf7d58fce0 100644 --- a/compiler_gym/envs/llvm/datasets.py +++ b/compiler_gym/envs/llvm/datasets.py @@ -19,7 +19,7 @@ import fasteners -from compiler_gym.datasets.dataset import Dataset +from compiler_gym.datasets.dataset import LegacyDataset from compiler_gym.util.download import download from compiler_gym.util.runfiles_path import cache_path, runfiles_path, site_data_path from compiler_gym.util.timer import Timer @@ -42,7 +42,7 @@ _COMPILE_ARGS = [] LLVM_DATASETS = [ - Dataset( + LegacyDataset( name="blas-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-blas-v0.tar.bz2", license="BSD 3-Clause", @@ -52,7 +52,7 @@ size_bytes=3969036, sha256="e724a8114709f8480adeb9873d48e426e8d9444b00cddce48e342b9f0f2b096d", ), - Dataset( + LegacyDataset( name="cBench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-cBench-v0-macos.tar.bz2", license="BSD 3-Clause", @@ -63,7 +63,7 @@ sha256="072a730c86144a07bba948c49afe543e4f06351f1cb17f7de77f91d5c1a1b120", platforms=["macos"], ), - Dataset( + LegacyDataset( name="cBench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-cBench-v0-linux.tar.bz2", license="BSD 3-Clause", @@ -74,7 +74,7 @@ sha256="9b5838a90895579aab3b9375e8eeb3ed2ae58e0ad354fec7eb4f8b31ecb4a360", platforms=["linux"], ), - Dataset( + LegacyDataset( name="github-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-github-v0.tar.bz2", license="CC BY 4.0", @@ -84,7 +84,7 @@ size_bytes=725974100, sha256="880269dd7a5c2508ea222a2e54c318c38c8090eb105c0a87c595e9dd31720764", ), - Dataset( + LegacyDataset( name="linux-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-linux-v0.tar.bz2", license="GPL-2.0", @@ -94,7 +94,7 @@ size_bytes=516031044, sha256="a1ae5c376af30ab042c9e54dc432f89ce75f9ebaee953bc19c08aff070f12566", ), - Dataset( + LegacyDataset( name="mibench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-mibench-v0.tar.bz2", license="BSD 3-Clause", @@ -104,7 +104,7 @@ size_bytes=238480, sha256="128c090c40b955b99fdf766da167a5f642018fb35c16a1d082f63be2e977eb13", ), - Dataset( + LegacyDataset( name="npb-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-npb-v0.tar.bz2", license="NASA Open Source Agreement v1.3", @@ -114,7 +114,7 @@ size_bytes=2287444, sha256="793ac2e7a4f4ed83709e8a270371e65b724da09eaa0095c52e7f4209f63bb1f2", ), - Dataset( + LegacyDataset( name="opencv-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-opencv-v0.tar.bz2", license="Apache 2.0", @@ -124,7 +124,7 @@ size_bytes=21903008, sha256="003df853bd58df93572862ca2f934c7b129db2a3573bcae69a2e59431037205c", ), - Dataset( + LegacyDataset( name="poj104-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-poj104-v0.tar.bz2", license="BSD 3-Clause", @@ -134,7 +134,7 @@ size_bytes=304207752, sha256="6254d629887f6b51efc1177788b0ce37339d5f3456fb8784415ed3b8c25cce27", ), - Dataset( + LegacyDataset( name="polybench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-polybench-v0.tar.bz2", license="BSD 3-Clause", @@ -144,7 +144,7 @@ size_bytes=162624, sha256="968087e68470e5b44dc687dae195143000c7478a23d6631b27055bb3bb3116b1", ), - Dataset( + LegacyDataset( name="tensorflow-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-tensorflow-v0.tar.bz2", license="Apache 2.0", diff --git 
a/tests/compiler_env_test.py b/tests/compiler_env_test.py index bafe8d86e9..c2554e9907 100644 --- a/tests/compiler_env_test.py +++ b/tests/compiler_env_test.py @@ -8,14 +8,14 @@ import gym import pytest -from compiler_gym.datasets import Dataset +from compiler_gym.datasets import LegacyDataset from compiler_gym.envs import CompilerEnv from tests.test_main import main pytest_plugins = ["tests.pytest_plugins.llvm"] -def make_dataset(**kwargs) -> Dataset: +def make_dataset(**kwargs) -> LegacyDataset: default_kwargs = { "name": "test-dataset-v0", "url": "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-blas-v0.tar.bz2", @@ -27,7 +27,7 @@ def make_dataset(**kwargs) -> Dataset: "sha256": "e724a8114709f8480adeb9873d48e426e8d9444b00cddce48e342b9f0f2b096d", } default_kwargs.update(kwargs) - return Dataset(**default_kwargs) + return LegacyDataset(**default_kwargs) def test_register_dataset(env: CompilerEnv):
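
For review context, a minimal sketch of the renamed API in use; this is illustrative only, not part of the patch. It assumes the llvm-v0 environment, the cBench-v0 entry registered in LLVM_DATASETS above, and that available_datasets is pre-populated with the registered LLVM datasets:

    import gym

    import compiler_gym  # noqa: F401  (imported to register the llvm-v0 environment)
    from compiler_gym.datasets import LegacyDataset

    env = gym.make("llvm-v0")

    # require_dataset() accepts a dataset name, a download URL, or a
    # LegacyDataset instance. With this patch it also reports whether a
    # new dataset had to be downloaded.
    downloaded = env.require_dataset("cBench-v0")
    print("newly downloaded:", downloaded)

    # available_datasets maps dataset names to LegacyDataset tuples.
    for name, dataset in env.available_datasets.items():
        assert isinstance(dataset, LegacyDataset)
        print(name, dataset.license, dataset.file_count)

    env.close()

Returning the download status from require_datasets() / require_dataset() lets callers distinguish a fresh install from a no-op, for example to log or re-index only when something actually changed.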