Completes B letter flake8-bugbear rules (#1099)
* Completes Ruff B letter flake8-bugbear rules

Refactors the code to take these rules into account.
skrawcz authored Aug 20, 2024
1 parent 8251ae4 commit 3ce39a1
Showing 69 changed files with 151 additions and 146 deletions.
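
Most of the hunks below apply a small set of recurring flake8-bugbear fixes: exception chaining (B904), sentinel defaults in place of mutable or call-produced defaults (B006/B008), throwaway names for unused loop variables (B007), and targeted noqa markers where a refactor was not worth it. As a frame of reference, here is a minimal, illustrative sketch of the B904 pattern; it is not code from this repository. Re-raising with "from" inside an except block keeps the original error as __cause__, so the traceback reads as one chained failure rather than two unrelated ones.

import json


def load_config(raw: str) -> dict:
    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        # B904-compliant: chain to the original error instead of a bare raise.
        raise ValueError("config is not valid JSON") from e


if __name__ == "__main__":
    try:
        load_config("{not json")
    except ValueError as err:
        print(err.__cause__)  # the original JSONDecodeError is preserved
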
@@ -262,7 +262,7 @@ def construct_df(
negatives_per_positive: int = 1,
random_seed: int = 123,
) -> pd.DataFrame:
f"""Return dataframe of {base_df} paris with negatives added."""
"""Return dataframe of {base_df} paris with negatives added."""
return pd.concat(
[
base_df,
4 changes: 2 additions & 2 deletions contrib/hamilton/contrib/user/zilto/lancedb_vdb/__init__.py
@@ -59,9 +59,9 @@ def table_ref(

try:
table = client.open_table(table_name)
except FileNotFoundError:
except FileNotFoundError as e:
if schema is None:
raise ValueError("`schema` must be provided to create table.")
raise ValueError("`schema` must be provided to create table.") from e

table = _create_table(
client=client,
@@ -125,7 +125,7 @@ def best_model_per_series(cross_validation_evaluation: pd.DataFrame) -> pd.Serie
def inference_predictions(
forecaster: StatsForecast,
inference_forecast_steps: int = 12,
inference_confidence_percentile: list[float] = [90.0],
inference_confidence_percentile: list[float] = [90.0], # noqa: B006
) -> pd.DataFrame:
"""Infer values using the training harness. Fitted models aren't stored
@@ -141,7 +141,7 @@ def plotting_config(
plot_uids: Optional[list[str]] = None,
plot_models: Optional[list[str]] = None,
plot_anomalies: bool = False,
plot_confidence_percentile: list[float] = [90.0],
plot_confidence_percentile: list[float] = [90.0], # noqa: B006
plot_engine: str = "matplotlib",
) -> dict:
"""Configuration for plotting functions"""
4 changes: 2 additions & 2 deletions contrib/hamilton/contrib/user/zilto/webscraper/__init__.py
@@ -54,8 +54,8 @@ def html_page(url: str) -> str:
def parsed_html(
url: str,
html_page: str,
tags_to_extract: List[str] = ["p", "li", "div"],
tags_to_remove: List[str] = ["script", "style"],
tags_to_extract: List[str] = ["p", "li", "div"], # noqa: B006
tags_to_remove: List[str] = ["script", "style"], # noqa: B006
) -> ParsingResult:
"""Parse an HTML string using BeautifulSoup
@@ -133,7 +133,7 @@ def cross_validation_folds(

def study(
higher_is_better: bool,
pruner: Optional[optuna.pruners.BasePruner] = optuna.pruners.MedianPruner(),
pruner: Optional[optuna.pruners.BasePruner] = None,
sampler: Optional[optuna.samplers.BaseSampler] = None,
study_storage: Optional[str] = None,
study_name: Optional[str] = None,
@@ -142,6 +142,8 @@ def study(
"""Create an optuna study; use the XGBoost + Optuna integration for pruning
ref: https:/optuna/optuna-examples/blob/main/xgboost/xgboost_integration.py
"""
if pruner is None:
pruner = optuna.pruners.MedianPruner()
return optuna.create_study(
direction="maximize" if higher_is_better else "minimize",
pruner=pruner,
4 changes: 2 additions & 2 deletions contrib/setup.py
@@ -10,8 +10,8 @@
try:
with open("README.md") as readme_file:
readme = readme_file.read()
except Exception:
warnings.warn("README.md not found")
except FileNotFoundError:
warnings.warn("README.md not found") # noqa
readme = None

REQUIREMENTS_FILES = ["requirements.txt"]
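
The setup.py hunk above narrows the bare except Exception to FileNotFoundError and leaves an unqualified noqa on the warnings.warn call. Assuming the silenced rule is B028 (the comment does not say which), the refactor-instead-of-noqa version would pass an explicit stacklevel so the warning is attributed to the caller rather than to this line; a sketch under that assumption:

import warnings

try:
    with open("README.md") as readme_file:
        readme = readme_file.read()
except FileNotFoundError:
    # B028 asks for an explicit stacklevel on warnings.warn.
    warnings.warn("README.md not found", stacklevel=2)
    readme = None
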
2 changes: 1 addition & 1 deletion examples/LLM_Workflows/GraphRAG/ingest_fighters.py
@@ -17,7 +17,7 @@ def raw_fighter_details() -> pd.DataFrame:

def fighter(raw_fighter_details: pd.DataFrame) -> Parallelizable[pd.Series]:
"""We then want to do something for each record. That's what this code sets up"""
for idx, row in raw_fighter_details.iterrows():
for _, row in raw_fighter_details.iterrows():
yield row
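
Several hunks in this commit, including the one above, only rename loop variables that are never read ("idx", "i") to an underscore, which is what B007 asks for: the underscore documents that the value is intentionally ignored. A small self-contained sketch, not repository code:

import pandas as pd

df = pd.DataFrame({"name": ["Anderson", "Silva"], "wins": [34, 22]})

# B007 flags a loop variable the body never uses; "_" makes the intent explicit.
for _, row in df.iterrows():
    print(row["name"], row["wins"])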


3 changes: 1 addition & 2 deletions examples/LLM_Workflows/image_telephone/streamlit.py
@@ -403,8 +403,7 @@ def explore_display():
image_urls_to_display = image_urls[0 : len(projection)]
if len(image_urls_to_display) != len(projection):
image_url_length = len(image_urls_to_display)
for i in range(len(projection) - len(image_urls_to_display)):
image_urls_to_display.append(image_urls[image_url_length - 1])
image_urls_to_display.append(image_urls[image_url_length - 1])
embedding_path_plot(projection, image_urls_to_display, selected_entry, prompt_path)
# highlight_point(projection, selected_entry)

2 changes: 1 addition & 1 deletion examples/LLM_Workflows/knowledge_retrieval/state.py
@@ -137,7 +137,7 @@ def call_arxiv_function(messages, full_message):
return response
except Exception as e:
logger.error(type(e))
raise Exception("Function chat request failed")
raise Exception("Function chat request failed") from e

elif full_message["message"]["function_call"]["name"] == "read_article_and_summarize":
parsed_output = json.loads(full_message["message"]["function_call"]["arguments"])
@@ -56,7 +56,7 @@ def pdf_text(pdf_path: pd.Series) -> pd.Series:
:return: Series of strings of the PDFs' contents
"""
_pdf_text = []
for i, file_path in pdf_path.items():
for _i, file_path in pdf_path.items():
# creating a pdf reader object
reader = PdfReader(file_path)
text = ""
@@ -59,7 +59,7 @@ class SummaryResponse(pydantic.BaseModel):


@app.post("/store_arxiv", tags=["Ingestion"])
async def store_arxiv(arxiv_ids: list[str] = fastapi.Form(...)) -> JSONResponse:
async def store_arxiv(arxiv_ids: list[str] = fastapi.Form(...)) -> JSONResponse: # noqa: B008
"""Retrieve PDF files of arxiv articles for arxiv_ids\n
Read the PDF as text, create chunks, and embed them using OpenAI API\n
Store chunks with embeddings in Weaviate.
@@ -27,8 +27,8 @@ def article_text(url: str, article_regex: str) -> str:
"""
try:
html = requests.get(url)
except requests.exceptions.RequestException:
raise Exception(f"Failed to get URL: {url}")
except requests.exceptions.RequestException as e:
raise Exception(f"Failed to get URL: {url}") from e
article = re.findall(article_regex, html.text, re.DOTALL)
if not article:
raise ValueError(f"No article found in {url}")
@@ -27,7 +27,9 @@ def sitemap_text(sitemap_url: str = "https://hamilton.dagworks.io/en/latest/site
try:
sitemap = requests.get(sitemap_url)
except Exception as e:
raise RuntimeError(f"Failed to fetch sitemap from {sitemap_url}. Original error: {str(e)}")
raise RuntimeError(
f"Failed to fetch sitemap from {sitemap_url}. Original error: {str(e)}"
) from e
return sitemap.text


2 changes: 1 addition & 1 deletion examples/dagster/dagster_code/tutorial/assets.py
@@ -55,7 +55,7 @@ def most_frequent_words() -> MaterializeResult:
for raw_title in topstories["title"]:
title = raw_title.lower()
for word in title.split():
cleaned_word = word.strip(".,-!?:;()[]'\"-")
cleaned_word = word.strip(".,-!?:;()[]'\"-") # noqa
if cleaned_word not in stopwords and len(cleaned_word) > 0:
word_counts[cleaned_word] = word_counts.get(cleaned_word, 0) + 1

@@ -93,7 +93,7 @@ def get_signups_for_date(self, date: datetime) -> Sequence[Signup]:
signups = []
num_signups = self.random.randint(25, 100)

for i in range(num_signups):
for _ in range(num_signups):
signup = self.generate_signup(date)
signups.append(signup.to_dict())

2 changes: 1 addition & 1 deletion examples/dagster/hamilton_code/dataflow.py
@@ -31,7 +31,7 @@ def most_frequent_words(title: pd.Series) -> dict[str, int]:
word_counts = {}
for raw_title in title:
for word in raw_title.lower().split():
word = word.strip(".,-!?:;()[]'\"-")
word = word.strip(".,-!?:;()[]'\"-") # noqa
if len(word) == 0:
continue

2 changes: 1 addition & 1 deletion examples/dagster/hamilton_code/mock_api.py
@@ -94,7 +94,7 @@ def get_signups_for_date(self, date: datetime) -> Sequence[Signup]:
signups = []
num_signups = self.random.randint(25, 100)

for i in range(num_signups):
for _ in range(num_signups):
signup = self.generate_signup(date)
signups.append(signup.to_dict())

4 changes: 2 additions & 2 deletions examples/decoupling_io/adapters.py
@@ -7,8 +7,8 @@
import sklearn.inspection
import sklearn.metrics
import sklearn.model_selection
except ImportError:
raise NotImplementedError("scikit-learn is not installed.")
except ImportError as e:
raise NotImplementedError("scikit-learn is not installed.") from e


from hamilton import registry
28 changes: 16 additions & 12 deletions examples/dlt/slack/__init__.py
@@ -168,12 +168,7 @@ def get_thread_replies(messages: List[Dict[str, Any]]) -> Iterable[TDataItem]:
write_disposition=write_disposition,
)
def messages_resource(
created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
"ts",
initial_value=start_dt,
end_value=end_dt,
allow_external_schedulers=True,
),
created_at: dlt.sources.incremental[DateTime] = None,
) -> Iterable[TDataItem]:
"""
Yield all messages for a set of selected channels as a DLT resource. Keep blocks column without normalization.
@@ -184,19 +179,21 @@ def messages_resource(
Yields:
Iterable[TDataItem]: A list of messages.
"""
if created_at is None:
created_at = dlt.sources.incremental(
"ts",
initial_value=start_dt,
end_value=end_dt,
allow_external_schedulers=True,
)
start_date_ts = ensure_dt_type(created_at.last_value, to_ts=True)
end_date_ts = ensure_dt_type(created_at.end_value, to_ts=True)
for channel_data in fetched_selected_channels:
yield from get_messages(channel_data, start_date_ts, end_date_ts)

def per_table_messages_resource(
channel_data: Dict[str, Any],
created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
"ts",
initial_value=start_dt,
end_value=end_dt,
allow_external_schedulers=True,
),
created_at: dlt.sources.incremental[DateTime] = None,
) -> Iterable[TDataItem]:
"""Yield all messages for a given channel as a DLT resource. Keep blocks column without normalization.
@@ -207,6 +204,13 @@ def per_table_messages_resource(
Yields:
Iterable[TDataItem]: A list of messages.
"""
if created_at is None:
created_at = dlt.sources.incremental(
"ts",
initial_value=start_dt,
end_value=end_dt,
allow_external_schedulers=True,
)
start_date_ts = ensure_dt_type(created_at.last_value, to_ts=True)
end_date_ts = ensure_dt_type(created_at.end_value, to_ts=True)
yield from get_messages(channel_data, start_date_ts, end_date_ts)
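
The dlt change above, like the optuna one earlier, swaps a default value built by a function call for a None sentinel that is resolved inside the body. The reason, sketched below with illustrative code rather than repository code: Python evaluates default values once, when the def statement runs, so a mutable or call-produced default is shared by every call that relies on it. That is exactly what B006 and B008 warn about.

from typing import Optional


def append_bad(item: int, bucket: list = []) -> list:  # flagged by B006
    bucket.append(item)  # the same list object is reused across calls
    return bucket


def append_good(item: int, bucket: Optional[list] = None) -> list:
    if bucket is None:  # B006-compliant: build the default at call time
        bucket = []
    bucket.append(item)
    return bucket


print(append_bad(1), append_bad(2))    # [1, 2] [1, 2]  (state leaks between calls)
print(append_good(1), append_good(2))  # [1] [2]
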
5 changes: 2 additions & 3 deletions examples/due_date_probabilities/probability_estimation.py
@@ -125,10 +125,9 @@ def raw_probabilities(raw_data: str) -> pd.DataFrame:

def resampled(raw_probabilities: pd.DataFrame) -> List[int]:
sample_data = []
for index, row in raw_probabilities.iterrows():
for _idx, row in raw_probabilities.iterrows():
count = row.probability * 1000
for i in range(int(count)):
sample_data.append(row.days)
sample_data.extend([row.days] * int(count))
return sample_data


2 changes: 1 addition & 1 deletion examples/people_data_labs/analysis.py
@@ -115,7 +115,7 @@ def stock_growth_rate_since_last_funding_round(
df = pd.merge(left=stock_data, right=period_start, on="ticker", how="inner")

stock_growth = dict()
for idx, row in df.iterrows():
for _, row in df.iterrows():
history = pd.json_normalize(row["historical_price"]).astype({"date": "datetime64[ns]"})

# skip ticker if history is empty
4 changes: 2 additions & 2 deletions examples/prefect/run.py
@@ -72,15 +72,15 @@ def train_and_evaluate_model_task(
)
def absenteeism_prediction_flow(
raw_data_location: str = "./data/Absenteeism_at_work.csv",
feature_set: list[str] = [
feature_set: list[str] = [ # noqa: B006
"age_zero_mean_unit_variance",
"has_children",
"has_pet",
"is_summer",
"service_time",
],
label: str = "absenteeism_time_in_hours",
validation_user_ids: list[str] = [
validation_user_ids: list[str] = [ # noqa: B006
"1",
"2",
"4",
7 changes: 3 additions & 4 deletions examples/spark/world_of_warcraft/zone_features__spark_v1.py
@@ -12,10 +12,9 @@ def world_of_warcraft(spark_session: ps.SparkSession) -> ps.DataFrame:

def zone_flags(world_of_warcraft: ps.DataFrame) -> ps.DataFrame:
zone_flags = world_of_warcraft
for zone in ["durotar", "darkshore"]:
zone_flags = zone_flags.withColumn(
"darkshore_flag", sf.when(sf.col("zone") == " Darkshore", 1).otherwise(0)
).withColumn("durotar_flag", sf.when(sf.col("zone") == " Durotar", 1).otherwise(0))
zone_flags = zone_flags.withColumn(
"darkshore_flag", sf.when(sf.col("zone") == " Darkshore", 1).otherwise(0)
).withColumn("durotar_flag", sf.when(sf.col("zone") == " Durotar", 1).otherwise(0))
return zone_flags


6 changes: 3 additions & 3 deletions hamilton/cli/__main__.py
@@ -127,7 +127,7 @@ def _try_command(cmd: Callable, **cmd_kwargs) -> Any:
command=cmd_name, success=False, message={"error": str(type(e)), "details": str(e)}
)
logger.error(dataclasses.asdict(response))
raise typer.Exit(code=1)
raise typer.Exit(code=1) from e

return result

@@ -297,12 +297,12 @@ def ui(
"""Runs the Hamilton UI on sqllite in port 8241"""
try:
from hamilton_ui import commands
except ImportError:
except ImportError as e:
logger.error(
"hamilton[ui] not installed -- you have to install this to run the UI. "
'Run `pip install "sf-hamilton[ui]"` to install and get started with the UI!'
)
raise typer.Exit(code=1)
raise typer.Exit(code=1) from e

ctx.invoke(
commands.run,
10 changes: 5 additions & 5 deletions hamilton/cli/logic.py
@@ -27,8 +27,8 @@ def get_git_base_directory() -> str:
else:
print("Error:", result.stderr.strip())
raise OSError(f"{result.stderr.strip()}")
except FileNotFoundError:
raise FileNotFoundError("Git command not found. Please make sure Git is installed.")
except FileNotFoundError as e:
raise FileNotFoundError("Git command not found. Please make sure Git is installed.") from e


def get_git_reference(git_relative_path: Union[str, Path], git_reference: str) -> str:
@@ -51,8 +51,8 @@ def get_git_reference(git_relative_path: Union[str, Path], git_reference: str) -
return
else:
return
except FileNotFoundError:
raise FileNotFoundError("Git command not found. Please make sure Git is installed.")
except FileNotFoundError as e:
raise FileNotFoundError("Git command not found. Please make sure Git is installed.") from e


def version_hamilton_functions(module: ModuleType) -> Dict[str, str]:
@@ -184,7 +184,7 @@ def diff_versions(current_map: Dict[str, str], reference_map: Dict[str, str]) ->
if v1 != v2:
edit.append(node_name)

for node_name, v2 in reference_map.items():
for node_name, _ in reference_map.items():
v1 = current_map.get(node_name)
if v1 is None:
reference_only.append(node_name)
4 changes: 2 additions & 2 deletions hamilton/dataflows/__init__.py
@@ -498,10 +498,10 @@ def are_py_dependencies_satisfied(dataflow, user=None, version="latest"):
else:
package_name = line
required_version = None
required_version # here for now...
required_version # noqa here for now...
try:
installed_version = pkg_version(package_name)
installed_version # here for now..
installed_version # noqa here for now..
except PackageNotFoundError:
logger.info(f"Package '{package_name}' is not installed.")
return False
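
The two bare statements kept above (required_version, installed_version) carry unqualified noqa markers; presumably they silence B018, which flags expression statements that have no effect. An illustrative sketch of what that rule catches, with a hypothetical helper standing in for the real package lookup:

def describe(package_name: str) -> str:
    installed_version = "1.2.3"  # stand-in for a real version lookup
    installed_version            # B018: a bare name as a statement does nothing
    return f"{package_name}=={installed_version}"
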
2 changes: 1 addition & 1 deletion hamilton/execution/executors.py
@@ -99,7 +99,7 @@ def base_execute_task(task: TaskImplementation) -> Dict[str, Any]:
for node_ in task.nodes:
if not getattr(node_, "callable_modified", False):
node_._callable = _modify_callable(node_.node_role, node_.callable)
setattr(node_, "callable_modified", True)
node_.callable_modified = True
if task.adapter.does_hook("pre_task_execute", is_async=False):
task.adapter.call_all_lifecycle_hooks_sync(
"pre_task_execute",