From c8e611673cdceeed1807d3f934c134c68bd2a9c2 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Fri, 26 Feb 2021 15:24:26 +0000 Subject: [PATCH] Rename Dataset tuple to LegacyDataset. In preparation for introducing a new Dataset class. Issue #45. --- compiler_gym/bin/datasets.py | 10 ++++++++-- compiler_gym/datasets/__init__.py | 10 ++++++++-- compiler_gym/datasets/dataset.py | 22 ++++++++++++---------- compiler_gym/envs/compiler_env.py | 26 +++++++++++++++----------- compiler_gym/envs/llvm/datasets.py | 24 ++++++++++++------------ tests/compiler_env_test.py | 6 +++--- 6 files changed, 58 insertions(+), 40 deletions(-) diff --git a/compiler_gym/bin/datasets.py b/compiler_gym/bin/datasets.py index 4cab24bc89..c6eec47306 100644 --- a/compiler_gym/bin/datasets.py +++ b/compiler_gym/bin/datasets.py @@ -139,7 +139,13 @@ import humanize from absl import app, flags -from compiler_gym.datasets.dataset import Dataset, activate, deactivate, delete, require +from compiler_gym.datasets.dataset import ( + LegacyDataset, + activate, + deactivate, + delete, + require, +) from compiler_gym.util.flags.env_from_flags import env_from_flags from compiler_gym.util.tabulate import tabulate @@ -183,7 +189,7 @@ def enumerate_directory(name: str, path: Path): for path in path.iterdir(): if not path.is_file() or not path.name.endswith(".json"): continue - dataset = Dataset.from_json_file(path) + dataset = LegacyDataset.from_json_file(path) rows.append( (dataset.name, dataset.license, dataset.file_count, dataset.size_bytes) ) diff --git a/compiler_gym/datasets/__init__.py b/compiler_gym/datasets/__init__.py index 6b9af1e543..b0dc9440c5 100644 --- a/compiler_gym/datasets/__init__.py +++ b/compiler_gym/datasets/__init__.py @@ -3,6 +3,12 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """Manage datasets of benchmarks.""" -from compiler_gym.datasets.dataset import Dataset, activate, deactivate, delete, require +from compiler_gym.datasets.dataset import ( + LegacyDataset, + activate, + deactivate, + delete, + require, +) -__all__ = ["Dataset", "require", "activate", "deactivate", "delete"] +__all__ = ["LegacyDataset", "require", "activate", "deactivate", "delete"] diff --git a/compiler_gym/datasets/dataset.py b/compiler_gym/datasets/dataset.py index bed7c9f375..32390b27d8 100644 --- a/compiler_gym/datasets/dataset.py +++ b/compiler_gym/datasets/dataset.py @@ -15,7 +15,7 @@ from compiler_gym.util.download import download -class Dataset(NamedTuple): +class LegacyDataset(NamedTuple): """A collection of benchmarks for use by an environment.""" name: str @@ -48,11 +48,11 @@ class Dataset(NamedTuple): """A list of platforms supported by this dataset. Allowed platforms are 'macos' and 'linux'.""" @classmethod - def from_json_file(cls, path: Path) -> "Dataset": + def from_json_file(cls, path: Path) -> "LegacyDataset": """Construct a dataset from a JSON metadata file. :param path: Path of the JSON metadata. - :return: A Dataset instance. + :return: A LegacyDataset instance. """ try: with open(str(path), "rb") as f: @@ -136,7 +136,7 @@ def deactivate(env, name: str) -> bool: return True -def require(env, dataset: Union[str, Dataset]) -> bool: +def require(env, dataset: Union[str, LegacyDataset]) -> bool: """Require that the given dataset is available to the environment. This will download and activate the dataset if it is not already installed. 
@@ -151,12 +151,14 @@ def require(env, dataset: Union[str, Dataset]) -> bool: :param env: The environment that this dataset is required for. :param dataset: The name of the dataset to download, the URL of the dataset, - or a :class:`Dataset` instance. + or a :class:`LegacyDataset` instance. :return: :code:`True` if the dataset was downloaded, or :code:`False` if the dataset was already available. """ - def download_and_unpack_archive(url: str, sha256: Optional[str] = None) -> Dataset: + def download_and_unpack_archive( + url: str, sha256: Optional[str] = None + ) -> LegacyDataset: json_files_before = { f for f in env.inactive_datasets_site_path.iterdir() @@ -173,9 +175,9 @@ def download_and_unpack_archive(url: str, sha256: Optional[str] = None) -> Datas new_json = json_files_after - json_files_before if not len(new_json): raise OSError(f"Downloaded dataset {url} contains no metadata JSON file") - return Dataset.from_json_file(list(new_json)[0]) + return LegacyDataset.from_json_file(list(new_json)[0]) - def unpack_local_archive(path: Path) -> Dataset: + def unpack_local_archive(path: Path) -> LegacyDataset: if not path.is_file(): raise FileNotFoundError(f"File not found: {path}") json_files_before = { @@ -193,12 +195,12 @@ def unpack_local_archive(path: Path) -> Dataset: new_json = json_files_after - json_files_before if not len(new_json): raise OSError(f"Local archive {path} contains no metadata JSON file") - return Dataset.from_json_file(list(new_json)[0]) + return LegacyDataset.from_json_file(list(new_json)[0]) with fasteners.InterProcessLock(env.datasets_site_path / "LOCK"): # Resolve the name and URL of the dataset. sha256 = None - if isinstance(dataset, Dataset): + if isinstance(dataset, LegacyDataset): name, url = dataset.name, dataset.url elif isinstance(dataset, str): # Check if we have already downloaded the dataset. diff --git a/compiler_gym/envs/compiler_env.py b/compiler_gym/envs/compiler_env.py index 985d874399..26abf8095d 100644 --- a/compiler_gym/envs/compiler_env.py +++ b/compiler_gym/envs/compiler_env.py @@ -19,7 +19,7 @@ from gym.spaces import Space from compiler_gym.compiler_env_state import CompilerEnvState -from compiler_gym.datasets.dataset import Dataset, require +from compiler_gym.datasets.dataset import LegacyDataset, require from compiler_gym.service import ( CompilerGymServiceConnection, ConnectionOpts, @@ -103,9 +103,9 @@ class CompilerEnv(gym.Env): to store benchmarks. :vartype datasets_site_path: Optional[Path] - :ivar available_datasets: A mapping from dataset name to :class:`Dataset` + :ivar available_datasets: A mapping from dataset name to :class:`LegacyDataset` objects that are available to download. - :vartype available_datasets: Dict[str, Dataset] + :vartype available_datasets: Dict[str, LegacyDataset] :ivar observation: A view of the available observation spaces that permits on-demand computation of observations. @@ -196,7 +196,7 @@ def __init__( self._service_endpoint: Union[str, Path] = service self._connection_settings = connection_settings or ConnectionOpts() self.datasets_site_path: Optional[Path] = None - self.available_datasets: Dict[str, Dataset] = {} + self.available_datasets: Dict[str, LegacyDataset] = {} # The benchmark that is currently being used, and the benchmark that # the user requested. 
Those do not always correlate, since the user @@ -818,7 +818,7 @@ def _reward_view_type(self): """ return RewardView - def require_datasets(self, datasets: List[Union[str, Dataset]]) -> None: + def require_datasets(self, datasets: List[Union[str, LegacyDataset]]) -> bool: """Require that the given datasets are available to the environment. Example usage: @@ -834,8 +834,11 @@ def require_datasets(self, datasets: List[Union[str, Dataset]]) -> None: :param datasets: A list of datasets to require. Each dataset is the name of an available dataset, the URL of a dataset to download, or a - :class:`Dataset` instance. + :class:`LegacyDataset` instance. + + :return: Whether a new dataset was downloaded. """ + self.logger.debug("Requiring datasets: %s", datasets) dataset_installed = False for dataset in datasets: dataset_installed |= require(self, dataset) @@ -849,15 +852,16 @@ def require_datasets(self, datasets: List[Union[str, Dataset]]) -> None: ), ) self.make_manifest_file() + return dataset_installed - def require_dataset(self, dataset: Union[str, Dataset]) -> None: + def require_dataset(self, dataset: Union[str, LegacyDataset]) -> bool: """Require that the given dataset is available to the environment. Alias for :meth:`env.require_datasets([dataset]) <compiler_gym.envs.CompilerEnv.require_datasets>`. :param dataset: The name of the dataset to download, the URL of the dataset, or a - :class:`Dataset` instance. + :class:`LegacyDataset` instance. """ return self.require_datasets([dataset]) @@ -885,7 +889,7 @@ def make_manifest_file(self) -> Path: ) return manifest_path - def register_dataset(self, dataset: Dataset) -> bool: + def register_dataset(self, dataset: LegacyDataset) -> bool: """Register a new dataset. After registering, the dataset name may be used by @@ -894,13 +898,13 @@ Example usage: - >>> my_dataset = Dataset(name="my-dataset-v0", ...) + >>> my_dataset = LegacyDataset(name="my-dataset-v0", ...) >>> env = gym.make("llvm-v0") >>> env.register_dataset(my_dataset) >>> env.require_dataset("my-dataset-v0") >>> env.benchmark = "my-dataset-v0/1" - :param dataset: A :class:`Dataset` instance describing the new dataset. + :param dataset: A :class:`LegacyDataset` instance describing the new dataset. :return: :code:`True` if the dataset was added, else :code:`False`. :raises ValueError: If a dataset with this name is already registered. 
""" diff --git a/compiler_gym/envs/llvm/datasets.py b/compiler_gym/envs/llvm/datasets.py index cc3d52f84c..cf7d58fce0 100644 --- a/compiler_gym/envs/llvm/datasets.py +++ b/compiler_gym/envs/llvm/datasets.py @@ -19,7 +19,7 @@ import fasteners -from compiler_gym.datasets.dataset import Dataset +from compiler_gym.datasets.dataset import LegacyDataset from compiler_gym.util.download import download from compiler_gym.util.runfiles_path import cache_path, runfiles_path, site_data_path from compiler_gym.util.timer import Timer @@ -42,7 +42,7 @@ _COMPILE_ARGS = [] LLVM_DATASETS = [ - Dataset( + LegacyDataset( name="blas-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-blas-v0.tar.bz2", license="BSD 3-Clause", @@ -52,7 +52,7 @@ size_bytes=3969036, sha256="e724a8114709f8480adeb9873d48e426e8d9444b00cddce48e342b9f0f2b096d", ), - Dataset( + LegacyDataset( name="cBench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-cBench-v0-macos.tar.bz2", license="BSD 3-Clause", @@ -63,7 +63,7 @@ sha256="072a730c86144a07bba948c49afe543e4f06351f1cb17f7de77f91d5c1a1b120", platforms=["macos"], ), - Dataset( + LegacyDataset( name="cBench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-cBench-v0-linux.tar.bz2", license="BSD 3-Clause", @@ -74,7 +74,7 @@ sha256="9b5838a90895579aab3b9375e8eeb3ed2ae58e0ad354fec7eb4f8b31ecb4a360", platforms=["linux"], ), - Dataset( + LegacyDataset( name="github-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-github-v0.tar.bz2", license="CC BY 4.0", @@ -84,7 +84,7 @@ size_bytes=725974100, sha256="880269dd7a5c2508ea222a2e54c318c38c8090eb105c0a87c595e9dd31720764", ), - Dataset( + LegacyDataset( name="linux-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-linux-v0.tar.bz2", license="GPL-2.0", @@ -94,7 +94,7 @@ size_bytes=516031044, sha256="a1ae5c376af30ab042c9e54dc432f89ce75f9ebaee953bc19c08aff070f12566", ), - Dataset( + LegacyDataset( name="mibench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-mibench-v0.tar.bz2", license="BSD 3-Clause", @@ -104,7 +104,7 @@ size_bytes=238480, sha256="128c090c40b955b99fdf766da167a5f642018fb35c16a1d082f63be2e977eb13", ), - Dataset( + LegacyDataset( name="npb-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-npb-v0.tar.bz2", license="NASA Open Source Agreement v1.3", @@ -114,7 +114,7 @@ size_bytes=2287444, sha256="793ac2e7a4f4ed83709e8a270371e65b724da09eaa0095c52e7f4209f63bb1f2", ), - Dataset( + LegacyDataset( name="opencv-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-opencv-v0.tar.bz2", license="Apache 2.0", @@ -124,7 +124,7 @@ size_bytes=21903008, sha256="003df853bd58df93572862ca2f934c7b129db2a3573bcae69a2e59431037205c", ), - Dataset( + LegacyDataset( name="poj104-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-poj104-v0.tar.bz2", license="BSD 3-Clause", @@ -134,7 +134,7 @@ size_bytes=304207752, sha256="6254d629887f6b51efc1177788b0ce37339d5f3456fb8784415ed3b8c25cce27", ), - Dataset( + LegacyDataset( name="polybench-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-polybench-v0.tar.bz2", license="BSD 3-Clause", @@ -144,7 +144,7 @@ size_bytes=162624, sha256="968087e68470e5b44dc687dae195143000c7478a23d6631b27055bb3bb3116b1", ), - Dataset( + LegacyDataset( name="tensorflow-v0", url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-tensorflow-v0.tar.bz2", license="Apache 2.0", diff --git 
a/tests/compiler_env_test.py b/tests/compiler_env_test.py index bafe8d86e9..c2554e9907 100644 --- a/tests/compiler_env_test.py +++ b/tests/compiler_env_test.py @@ -8,14 +8,14 @@ import gym import pytest -from compiler_gym.datasets import Dataset +from compiler_gym.datasets import LegacyDataset from compiler_gym.envs import CompilerEnv from tests.test_main import main pytest_plugins = ["tests.pytest_plugins.llvm"] -def make_dataset(**kwargs) -> Dataset: +def make_dataset(**kwargs) -> LegacyDataset: default_kwargs = { "name": "test-dataset-v0", "url": "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-blas-v0.tar.bz2", @@ -27,7 +27,7 @@ def make_dataset(**kwargs) -> Dataset: "sha256": "e724a8114709f8480adeb9873d48e426e8d9444b00cddce48e342b9f0f2b096d", } default_kwargs.update(kwargs) - return Dataset(**default_kwargs) + return LegacyDataset(**default_kwargs) def test_register_dataset(env: CompilerEnv):
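
For review context, a minimal sketch of the renamed API in use; this is illustrative only, not part of the patch. It assumes the llvm-v0 environment, the cBench-v0 entry registered in LLVM_DATASETS above, and that available_datasets is pre-populated with the registered LLVM datasets:

    import gym

    import compiler_gym  # noqa: F401  (imported to register the llvm-v0 environment)
    from compiler_gym.datasets import LegacyDataset

    env = gym.make("llvm-v0")

    # require_dataset() accepts a dataset name, a download URL, or a
    # LegacyDataset instance. With this patch it also reports whether a
    # new dataset had to be downloaded.
    downloaded = env.require_dataset("cBench-v0")
    print("newly downloaded:", downloaded)

    # available_datasets maps dataset names to LegacyDataset tuples.
    for name, dataset in env.available_datasets.items():
        assert isinstance(dataset, LegacyDataset)
        print(name, dataset.license, dataset.file_count)

    env.close()

Returning the download status from require_datasets() / require_dataset() lets callers distinguish a fresh install from a no-op, for example to log or re-index only when something actually changed.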