[Cherry-Picks] for 1.7 as of Wed, Jan 24th, 2024 11:01PM (#413)
* `RegistryMixin` improved alias management (#404)

* initial commit

* add docstrings

* simplify

* hardening

* refactor

* format registry lookup strings to be lowercase

* standardise aliases

* Move evaluator registry (#411)

* More control over external data size (#412)

---------

Co-authored-by: Rahul Tuli <[email protected]>
dbogunowicz and rahul-tuli authored Jan 24, 2024
1 parent 820e2c6 commit 3fc414d
Showing 9 changed files with 590 additions and 57 deletions.
17 changes: 17 additions & 0 deletions src/sparsezoo/evaluation/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa

from .registry import *
28 changes: 28 additions & 0 deletions src/sparsezoo/evaluation/registry.py
@@ -0,0 +1,28 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Implementation of a registry for evaluation functions
"""

from sparsezoo.utils.registry import RegistryMixin


__all__ = ["EvaluationRegistry"]


class EvaluationRegistry(RegistryMixin):
"""
Extends the RegistryMixin to enable registering
and loading of evaluation functions.
"""
142 changes: 142 additions & 0 deletions src/sparsezoo/evaluation/results.py
@@ -0,0 +1,142 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List, Optional, Union

import numpy
import yaml
from pydantic import BaseModel, Field


__all__ = [
    "Metric",
    "Dataset",
    "EvalSample",
    "Evaluation",
    "Result",
    "save_result",
]


def prep_for_serialization(
    data: Union[BaseModel, numpy.ndarray, list]
) -> Union[BaseModel, list]:
    """
    Prepares input data for JSON serialization by converting any numpy array
    fields to lists. For large numpy arrays, this operation can take a while to run.

    :param data: data that is to be processed before
        serialization. Nested objects are supported.
    :return: the input data with any numpy arrays
        converted to lists
    """
    if isinstance(data, BaseModel):
        for field_name in data.__fields__.keys():
            field_value = getattr(data, field_name)
            if isinstance(field_value, (numpy.ndarray, BaseModel, list)):
                setattr(
                    data,
                    field_name,
                    prep_for_serialization(field_value),
                )

    elif isinstance(data, numpy.ndarray):
        data = data.tolist()

    elif isinstance(data, list):
        for i, value in enumerate(data):
            data[i] = prep_for_serialization(value)

    elif isinstance(data, dict):
        for key, value in data.items():
            data[key] = prep_for_serialization(value)

    return data
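
To make the conversion above concrete, the following is a small usage sketch. The import path mirrors the file location shown in this diff and is an assumption about how the module is packaged; EvalSample is the model defined later in this file.

# Usage sketch; sparsezoo.evaluation.results as an import path is an assumption
import numpy

from sparsezoo.evaluation.results import EvalSample, prep_for_serialization

sample = EvalSample(input=numpy.arange(4), output=numpy.ones((2, 2)))
sample = prep_for_serialization(sample)

print(sample.input)   # [0, 1, 2, 3]
print(sample.output)  # [[1.0, 1.0], [1.0, 1.0]]
print(sample.json())  # succeeds now that the arrays are plain lists
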


class Metric(BaseModel):
    name: str = Field(description="Name of the metric")
    value: float = Field(description="Value of the metric")


class Dataset(BaseModel):
    type: Optional[str] = Field(description="Type of dataset")
    name: str = Field(description="Name of the dataset")
    config: Any = Field(description="Configuration for the dataset")
    split: Optional[str] = Field(description="Split of the dataset")


class EvalSample(BaseModel):
    input: Any = Field(description="Sample input to the model")
    output: Any = Field(description="Sample output from the model")


class Evaluation(BaseModel):
    task: str = Field(
        description="Name of the evaluation integration "
        "that the evaluation was performed on"
    )
    dataset: Dataset = Field(description="Dataset that the evaluation was performed on")
    metrics: List[Metric] = Field(description="List of metrics for the evaluation")
    samples: Optional[List[EvalSample]] = Field(
        description="List of samples for the evaluation"
    )


class Result(BaseModel):
    formatted: List[Evaluation] = Field(
        description="Evaluation result represented in the unified, structured format"
    )
    raw: Any = Field(
        description="Evaluation result represented in the raw format "
        "(characteristic for the specific evaluation integration)"
    )


def save_result(
    result: Result,
    save_path: str,
    save_format: str = "json",
):
    """
    Saves a Result object to a file in the specified format.

    :param result: Result object to save
    :param save_path: Path to save the result to
    :param save_format: Format to save the result in ("json" or "yaml")
    """
    # prepare the Result object for serialization
    result: Result = prep_for_serialization(result)
    if save_format == "json":
        _save_to_json(result, save_path)
    elif save_format == "yaml":
        _save_to_yaml(result, save_path)
    else:
        raise NotImplementedError(
            "Currently only json and yaml formats are supported"
        )


def _save_to_json(result: Result, save_path: str):
    _save(result.json(), save_path, expected_ext=".json")


def _save_to_yaml(result: Result, save_path: str):
    _save(yaml.dump(result.dict()), save_path, expected_ext=".yaml")


def _save(data: str, save_path: str, expected_ext: str):
    if not save_path.endswith(expected_ext):
        raise ValueError(f"save_path must end with extension: {expected_ext}")
    with open(save_path, "w") as f:
        f.write(data)
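
Putting the models and save_result together, a usage sketch might look like the following. The task, dataset, and metric values are made up for illustration, and the import path is again assumed from the file location in this diff.

from sparsezoo.evaluation.results import (
    Dataset,
    Evaluation,
    Metric,
    Result,
    save_result,
)

# Illustrative values only; a real integration would populate these from its output
evaluation = Evaluation(
    task="lm-evaluation-harness",
    dataset=Dataset(type="text", name="wikitext", config=None, split="test"),
    metrics=[Metric(name="perplexity", value=9.87)],
    samples=None,
)
result = Result(formatted=[evaluation], raw={"perplexity": 9.87})

save_result(result, "result.json")                      # default json format
save_result(result, "result.yaml", save_format="yaml")  # yaml via yaml.dump
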
25 changes: 21 additions & 4 deletions src/sparsezoo/utils/onnx/external_data.py
@@ -39,6 +39,14 @@
 
 EXTERNAL_ONNX_DATA_NAME = "model.data"
 
+# DUMP_EXTERNAL_DATA_THRESHOLD controls when a model is
+# saved with external data: if the model is larger than
+# this value, it will be saved with external data.
+# The threshold is expressed in bits and corresponds
+# to 500MB, roughly the size of
+# 250 million parameters (assuming fp16).
+DUMP_EXTERNAL_DATA_THRESHOLD = 4e9
+
 
 def onnx_includes_external_data(model: ModelProto) -> bool:
     """
@@ -66,6 +74,7 @@ def onnx_includes_external_data(model: ModelProto) -> bool:
 def save_onnx(
     model: ModelProto,
     model_path: str,
+    max_external_file_size: int = 16e9,
     external_data_file: Optional[str] = None,
 ) -> bool:
     """
@@ -84,10 +93,15 @@ def save_onnx(
         large to be saved as a single protobuf, and this argument is None,
         the external data file will be coerced to take the default name
         specified in the variable EXTERNAL_ONNX_DATA_NAME
+    :param max_external_file_size: The maximum file size in bytes of a single split
+        external data output file. Defaults to 16000000000 (16e9 = 16GB)
     :return True if the model was saved with external data, False otherwise.
     """
     if external_data_file is not None:
-        _LOGGER.debug(f"Saving with external data: {external_data_file}")
+        _LOGGER.debug(
+            f"Saving with external data, with file chunks of maximum size "
+            f"{max_external_file_size / 1e9} GB"
+        )
         _check_for_old_external_data(
             model_path=model_path, external_data_file=external_data_file
         )
@@ -98,13 +112,15 @@
             all_tensors_to_one_file=True,
             location=external_data_file,
         )
+        split_external_data(model_path, max_file_size=max_external_file_size)
         return True
 
-    if model.ByteSize() > onnx.checker.MAXIMUM_PROTOBUF:
+    if model.ByteSize() > DUMP_EXTERNAL_DATA_THRESHOLD:
         external_data_file = external_data_file or EXTERNAL_ONNX_DATA_NAME
-        _LOGGER.warning(
+        _LOGGER.debug(
             "The ONNX model is too large to be saved as a single protobuf. "
-            f"Saving with external data: {external_data_file}"
+            "Saving with external data, with file chunks of maximum size "
+            f"{max_external_file_size / 1e9} GB"
         )
         _check_for_old_external_data(
             model_path=model_path, external_data_file=external_data_file
@@ -116,6 +132,7 @@
             all_tensors_to_one_file=True,
             location=external_data_file,
         )
+        split_external_data(model_path, max_file_size=max_external_file_size)
         return True
 
     onnx.save(model, model_path)
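
A usage sketch for the updated save_onnx follows. It assumes the function is importable from sparsezoo.utils.onnx.external_data (the file being modified here); the 2 GB cap and the file names are illustrative.

import onnx

# Assumed import path, based on the location of the file modified in this diff
from sparsezoo.utils.onnx.external_data import save_onnx

model = onnx.load("model.onnx")

# Save with external data, capping each external data file at roughly 2 GB;
# per the diff above, split_external_data() then chunks the dumped data file
saved_externally = save_onnx(
    model,
    "exported/model.onnx",
    max_external_file_size=int(2e9),
    external_data_file="model.data",
)
print(f"Saved with external data: {saved_externally}")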