added environment variable to disable extensions autoload (#1095)

* added environment variable to disable extensions autoload --------- Co-authored-by: zilto <tjean@DESKTOP-V6JDCS2>
DAGWorks-Inc · Aug 16, 2024 · 8251ae4 · 8251ae4
1 parent 9e6f87b
commit 8251ae4
Show file tree

Hide file tree

Showing 6 changed files with 247 additions and 54 deletions.
diff --git a/docs/how-tos/extensions-autoloading.rst b/docs/how-tos/extensions-autoloading.rst
@@ -0,0 +1,97 @@
+=====================
+Extension autoloading
+=====================
+
+Under ``hamilton.plugins``, there are many modules named ``*_extensions`` (e.g., ``hamilton.plugins.pandas_extensions``, ``hamilton.plugins.mlflow_extensions``). They implement Hamilton features for 3rd party libraries, including ``@extract_columns``, materializers (``to.parquet``, ``from_.mlflow``), and more.
+
+
+Autoloading behavior
+--------------------
+
+By default, Hamilton attempts to load all extensions one-by-one. This means that as you have more Python packages in your environment (e.g., ``pandas``, ``pyspark``, ``mlflow``, ``xgboost``), importing Hamilton appears to become slower because it actually imports many packages.
+
+This behavior can be less desirable when your Hamilton dataflow doesn't use any of these packages, but you need them in your Python environment nonetheless. For example, if only ``pandas`` is needed for your dataflow, but you have ``mlflow`` and ``xgboost`` in your environment, their respective extensions will be loaded each time.
+
+
+Disable autoloading
+--------------------
+
+Disabling extension autoloading lets you import Hamilton without any extensions, which can reduce import time from 2-3 sec to less than 0.5 sec. This speedup is welcome when you need to restart a notebook's kernel often or you're operating in a low-RAM environment (some Python packages are larger than 50 MB).
+
+There are three ways to opt out: programmatically, via an environment variable, or via the configuration file. You must opt out before any other ``hamilton`` import.
+
+1. Programmatically
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ from hamilton import registry
+ registry.disable_autoload()
+
+2. Environment variables
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+From the console
+
+.. code-block:: console
+
+ export HAMILTON_AUTOLOAD_EXTENSIONS=0
+
+Programmatically via Python ``os.environ``.
+
+.. code-block:: python
+
+ import os
+ os.environ["HAMILTON_AUTOLOAD_EXTENSIONS"] = "0"
+
+Programmatically in Jupyter notebooks
+
+.. code-block:: python
+
+ %env HAMILTON_AUTOLOAD_EXTENSIONS=0
+
+3. Configuration file
+~~~~~~~~~~~~~~~~~~~~~
+
+Using the following command disables autoloading via the configuration file ``~/.hamilton.conf``. Hamilton won't autoload extensions anymore (i.e., you won't need to use approach 1 or 2 each time).
+
+.. code-block:: console
+
+ hamilton-disable-autoload-extensions
+
+To revert this configuration use the following command
+
+.. code-block:: console
+
+ hamilton-enable-autoload-extensions
+
+To re-enable autoloading in specific files, you can delete the environment variable or use ``registry.enable_autoload()`` before calling ``registry.initialize()``:
+
+.. code-block:: python
+
+ from hamilton import registry
+ registry.enable_autoload()
+ registry.initialize()
+
+
+Manually loading extensions
+----------------------------
+
+If you disabled autoloading, extensions need to be loaded manually. You should load them before having any other ``hamilton`` import to avoid hard-to-track bugs. There are two ways.
+
+1. Importing the extension
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ from hamilton.plugins import pandas_extensions, mlflow_extensions
+
+2. Registering the extension
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This approach has good IDE support via ``typing.Literal``
+
+.. code-block:: python
+
+ from hamilton import registry
+ registry.load_extension("mlflow")
diff --git a/docs/how-tos/index.rst b/docs/how-tos/index.rst
@@ -18,6 +18,7 @@ directory. If there's an example you want but don't see, reach out or open an is
  cache-nodes
  scale-up
  microservice
+ extensions-autoloading
  wrapping-driver
  cli-reference
  pre-commit-hooks
diff --git a/hamilton/function_modifiers/base.py b/hamilton/function_modifiers/base.py
@@ -20,38 +20,7 @@
  # Trigger load of extensions here because decorators are the only thing that use the registry
  # right now. Side note: ray serializes things weirdly, so we need to do this here rather than in
  # in the other choice of hamilton/base.py.
- plugins_modules = [
- "yaml",
- "matplotlib",
- "numpy",
- "pandas",
- "plotly",
- "polars",
- "polars_lazyframe",
- "pyspark_pandas",
- "spark",
- "dask",
- "geopandas",
- "xgboost",
- "lightgbm",
- "sklearn_plot",
- "vaex",
- "ibis",
- "dlt",
- "kedro",
- "huggingface",
- "mlflow",
- ]
- for plugin_module in plugins_modules:
- try:
- registry.load_extension(plugin_module)
- except NotImplementedError as e:
- logger.debug(f"Did not load {plugin_module} extension because {str(e)}.")
- except ModuleNotFoundError as e:
- logger.debug(f"Did not load {plugin_module} extension because {e.msg}.")
- except ImportError as e:
- logger.debug(f"Did not load {plugin_module} extension because {str(e)}.")
- registry.INITIALIZED = True
+ registry.initialize()
 
 
 def sanitize_function_name(name: str) -> str:

diff --git a/hamilton/registry.py b/hamilton/registry.py
@@ -1,13 +1,43 @@
 import collections
+import configparser
 import functools
 import importlib
 import logging
-from typing import Any, Dict, Optional, Type
+import os
+import pathlib
+from typing import Any, Dict, Literal, Optional, Tuple, Type, get_args
 
 logger = logging.getLogger(__name__)
 
 # Use this to ensure the registry is loaded only once.
 INITIALIZED = False
+# Closed set of extension names used to annotate `load_extension()`; each name
+# is imported there as `hamilton.plugins.<name>_extensions`.
+ExtensionName = Literal[
+ "yaml",
+ "matplotlib",
+ "numpy",
+ "pandas",
+ "plotly",
+ "polars",
+ "polars_lazyframe",
+ "pyspark_pandas",
+ "spark",
+ "dask",
+ "geopandas",
+ "xgboost",
+ "lightgbm",
+ "sklearn_plot",
+ "vaex",
+ "ibis",
+ "dlt",
+ "kedro",
+ "huggingface",
+ "mlflow",
+]
+# Runtime tuple of the names above, derived from the Literal; `initialize()` iterates over it.
+HAMILTON_EXTENSIONS: Tuple[ExtensionName, ...] = get_args(ExtensionName)
+# Name of the environment variable read by `initialize()`; the value "0" disables autoloading.
+HAMILTON_AUTOLOAD_ENV = "HAMILTON_AUTOLOAD_EXTENSIONS"
+# NOTE the variable DEFAULT_CONFIG_LOCATION is redundant with `hamilton.telemetry`
+# but this `registry` module must avoid circular imports
+DEFAULT_CONFIG_LOCATION = pathlib.Path("~/.hamilton.conf").expanduser()
 
 # This is a dictionary of extension name -> dict with dataframe and column types.
 DF_TYPE_AND_COLUMN_TYPES: Dict[str, Dict[str, Type]] = {}
@@ -16,6 +46,108 @@
 DATAFRAME_TYPE = "dataframe_type"
 
 
+def load_autoload_config() -> configparser.ConfigParser:
+    """Read the Hamilton config file and mirror its autoload setting into the environment.
+
+    If the file at ``DEFAULT_CONFIG_LOCATION`` defines the ``HAMILTON_AUTOLOAD_ENV``
+    option under ``DEFAULT``, its value is copied into ``os.environ``; otherwise
+    the environment is left untouched.
+
+    :return: the parser holding whatever was read from the config file.
+    """
+    parser = configparser.ConfigParser()
+    parser.read(DEFAULT_CONFIG_LOCATION)
+
+    # Nothing configured: hand back the parser without touching the environment.
+    if not parser.has_option("DEFAULT", HAMILTON_AUTOLOAD_ENV):
+        return parser
+
+    os.environ[HAMILTON_AUTOLOAD_ENV] = parser.get("DEFAULT", HAMILTON_AUTOLOAD_ENV)
+    return parser
+
+
+# Executed when this module is imported, so the config file is applied to the
+# environment before any function below reads it.
+load_autoload_config()
+
+
+def load_extension(plugin_module: ExtensionName):
+    """Import the Hamilton extension module for the given plugin name.
+
+    The module ``hamilton.plugins.<plugin_module>_extensions`` is imported. Unless it
+    sets ``COLUMN_FRIENDLY_DF_TYPE`` to a falsy value, it is treated as a dataframe
+    extension and must expose ``register_types``, ``get_column_<plugin_module>`` and
+    ``fill_with_scalar_<plugin_module>`` (needed, for example, by ``@extract_columns``).
+
+    :param plugin_module: the module name sans .py. e.g. pandas, polars, pyspark_pandas.
+    """
+    extension = importlib.import_module(f"hamilton.plugins.{plugin_module}_extensions")
+    # Extensions are assumed to be column-friendly dataframe extensions unless
+    # they explicitly opt out.
+    if getattr(extension, "COLUMN_FRIENDLY_DF_TYPE", True):
+        column_getter = f"get_column_{plugin_module}"
+        scalar_filler = f"fill_with_scalar_{plugin_module}"
+        assert hasattr(
+            extension, "register_types"
+        ), "Error extension missing function register_types()"
+        assert hasattr(extension, column_getter), f"Error extension missing {column_getter}"
+        assert hasattr(extension, scalar_filler), f"Error extension missing {scalar_filler}"
+    logger.info(f"Detected {plugin_module} and successfully loaded Hamilton extensions.")
+
+
+def initialize():
+    """Try to load every known extension unless autoloading is disabled.
+
+    When the environment variable named by ``HAMILTON_AUTOLOAD_ENV`` equals ``"0"``,
+    no extension is loaded. Otherwise each name in ``HAMILTON_EXTENSIONS`` is passed
+    to ``load_extension()``; ``NotImplementedError``, ``ModuleNotFoundError`` and
+    ``ImportError`` are logged at debug level and the extension is skipped.
+    ``INITIALIZED`` is set to ``True`` in both cases.
+    """
+    global INITIALIZED
+
+    autoload_setting = os.environ.get(HAMILTON_AUTOLOAD_ENV)
+    logger.debug(f"{HAMILTON_AUTOLOAD_ENV}={autoload_setting}")
+
+    # The setting is read once up front rather than once per extension;
+    # "0" means only extensions explicitly imported by the user are available.
+    if str(autoload_setting) != "0":
+        for extension_name in HAMILTON_EXTENSIONS:
+            try:
+                load_extension(extension_name)
+            except NotImplementedError as e:
+                logger.debug(f"Did not load {extension_name} extension because {str(e)}.")
+            except ModuleNotFoundError as e:
+                logger.debug(f"Did not load {extension_name} extension because {e.msg}.")
+            except ImportError as e:
+                logger.debug(f"Did not load {extension_name} extension because {str(e)}.")
+
+    INITIALIZED = True
+
+
+def disable_autoload():
+    """Turn off extension autoloading for the current process.
+
+    Sets the environment variable named by ``HAMILTON_AUTOLOAD_ENV`` to ``"0"``.
+    This needs to be done before hamilton.driver is imported.
+    """
+    os.environ.update({HAMILTON_AUTOLOAD_ENV: "0"})
+
+
+def enable_autoload():
+    """Enable extension autoloading by removing the environment variable.
+
+    Safe to call when the variable is not set: in that case nothing happens.
+    This needs to be done before hamilton.driver is imported.
+    """
+    # `pop` with a default avoids the KeyError that `del` raises when the
+    # variable is absent (e.g. autoloading was never disabled).
+    os.environ.pop(HAMILTON_AUTOLOAD_ENV, None)
+
+
+def config_enable_autoload():
+    """Modify the Hamilton config file to enable extension autoloading.
+
+    Loads the config via ``load_autoload_config()``, removes the
+    ``HAMILTON_AUTOLOAD_ENV`` option from its ``DEFAULT`` section and writes the
+    result back to ``DEFAULT_CONFIG_LOCATION``.
+    Autoloading can be disabled manually via `hamilton.registry.disable_autoload()`
+    before importing `hamilton.driver`.
+
+    NOTE the function name is tied to an entrypoint in `pyproject.toml`
+    """
+    config = load_autoload_config()
+    # No `add_section("DEFAULT")` guard is needed: a ConfigParser always has its
+    # default section, and `add_section` rejects that name with ValueError.
+    config.remove_option("DEFAULT", HAMILTON_AUTOLOAD_ENV)
+    with DEFAULT_CONFIG_LOCATION.open("w") as f:
+        config.write(f)
+
+
+def config_disable_autoload():
+    """Modify the Hamilton config file to disable extension autoloading.
+
+    Loads the config via ``load_autoload_config()``, sets the
+    ``HAMILTON_AUTOLOAD_ENV`` option in its ``DEFAULT`` section to ``"0"`` and
+    writes the result back to ``DEFAULT_CONFIG_LOCATION``.
+    Autoloading can be enabled manually via `hamilton.registry.enable_autoload()`
+    before importing `hamilton.driver`.
+
+    NOTE the function name is tied to an entrypoint in `pyproject.toml`
+    """
+    config = load_autoload_config()
+    # No `add_section("DEFAULT")` guard is needed: a ConfigParser always has its
+    # default section, and `add_section` rejects that name with ValueError.
+    config.set("DEFAULT", HAMILTON_AUTOLOAD_ENV, "0")
+    with DEFAULT_CONFIG_LOCATION.open("w") as f:
+        config.write(f)
+
+
 def register_types(extension_name: str, dataframe_type: Type, column_type: Optional[Type]):
  """Registers the dataframe and column types for the extension. Note that column types are optional
  as some extensions may not have a column type (E.G. spark). In this case, this is not included
@@ -73,27 +205,6 @@ def get_column_type_from_df_type(dataframe_type: Type) -> Type:
  )
 
 
-def load_extension(plugin_module: str):
- """Given a module name, loads it for Hamilton to use.
-
- :param plugin_module: the module name sans .py. e.g. pandas, polars, pyspark_pandas.
- """
- mod = importlib.import_module(f"hamilton.plugins.{plugin_module}_extensions")
- # We have various plugin extensions. We default to assuming it's a dataframe extension with columns,
- # unless it explicitly says it's not.
- # We need to check the following if we are to enable `@extract_columns` for example.
- extractable = getattr(mod, "COLUMN_FRIENDLY_DF_TYPE", True)
- if extractable:
- assert hasattr(mod, "register_types"), "Error extension missing function register_types()"
- assert hasattr(
- mod, f"get_column_{plugin_module}"
- ), f"Error extension missing get_column_{plugin_module}"
- assert hasattr(
- mod, f"fill_with_scalar_{plugin_module}"
- ), f"Error extension missing fill_with_scalar_{plugin_module}"
- logger.info(f"Detected {plugin_module} and successfully loaded Hamilton extensions.")
-
-
 LOADER_REGISTRY = collections.defaultdict(list)
 SAVER_REGISTRY = collections.defaultdict(list)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -153,6 +153,8 @@ h_experiments = "hamilton.plugins.h_experiments.__main__:main"
 hamilton = "hamilton.cli.__main__:cli"
 hamilton-admin-build-ui = "hamilton.admin:build_ui"
 hamilton-admin-build-and-publish = "hamilton.admin:build_and_publish"
+hamilton-disable-autoload-extensions = "hamilton.registry:config_disable_autoload"
+hamilton-enable-autoload-extensions = "hamilton.registry:config_enable_autoload"
 
 [project.urls]
 homepage = "https://www.tryhamilton.dev/"

diff --git a/tests/test_registry.py b/tests/test_registry.py
@@ -0,0 +1,13 @@
+import pytest
+
+from hamilton import registry
+
+
+@pytest.mark.parametrize("entrypoint", ["config_disable_autoload", "config_enable_autoload"])
+def test_command_entrypoints_arent_renamed(entrypoint: str):
+    """Ensures that functions associated with an entrypoint in
+    pyproject.toml aren't renamed.
+
+    This doesn't prevent the entrypoints themselves (the command names in
+    pyproject.toml) from being renamed.
+    """
+    # An entrypoint target must be callable; `hasattr` alone would also accept
+    # a non-callable attribute that happens to share the name.
+    assert callable(getattr(registry, entrypoint, None))