Refactor flatten DataFrame code
MrPowers committed Oct 8, 2023
1 parent 6a7c3ef commit 92932e7
Showing 8 changed files with 342 additions and 317 deletions.
2 changes: 1 addition & 1 deletion quinn/extensions/column_ext.py
@@ -63,7 +63,7 @@ def isNullOrBlank(self: Column) -> Column:
blank characters, or ``False`` otherwise.
:rtype: Column
"""
return (self.isNull()) | (trim(self) == "")
return (self.isNull()) | (trim(self) == "") # noqa: PLC1901

Check failure (GitHub Actions / ruff): quinn/extensions/column_ext.py:66:50: RUF100 Unused `noqa` directive (non-enabled: `PLC1901`)


def isNotIn(self: Column, _list: list[Any]) -> Column:
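For context, a minimal usage sketch of the extension touched above. It is illustrative only and assumes an active SparkSession named `spark` with quinn's Column extensions attached (how the extensions are attached is not shown in this diff):

df = spark.createDataFrame([("  ",), (None,), ("quinn",)], ["word"])
df.select(df.word.isNullOrBlank().alias("word_is_blank")).show()
# "  " and None evaluate to True; "quinn" evaluates to False.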
167 changes: 3 additions & 164 deletions quinn/functions.py
@@ -3,23 +3,22 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Callable
from numbers import Number

from pyspark.sql import Column, DataFrame
from pyspark.sql import Column
from pyspark.sql.functions import udf


import re
import uuid
from typing import Any, Callable
from typing import Any

import pyspark.sql.functions as F # noqa: N812
from pyspark.sql.types import (
ArrayType,
BooleanType,
MapType,
StringType,
StructType,
)


@@ -243,166 +242,6 @@ def regexp_extract_all(s: Column, regexp: Column) -> Column:
return None if s is None else re.findall(regexp, s)


def sanitize_column_name(name: str, replace_char: str = "_") -> str:
"""Sanitizes column names by replacing special characters with the specified character.
:param name: The original column name.
:type name: str
:param replace_char: The character to replace special characters with, defaults to '_'.
:type replace_char: str, optional
:return: The sanitized column name.
:rtype: str
"""
return re.sub(r"[^a-zA-Z0-9_]", replace_char, name)

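# Illustrative usage sketch (not part of the module above): every character
# outside [a-zA-Z0-9_], including spaces, is replaced one-for-one.
assert sanitize_column_name("order id#") == "order_id_"
assert sanitize_column_name("order id#", replace_char="-") == "order-id-"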

def _get_complex_fields(df: DataFrame) -> dict[str, object]:
"""Returns a dictionary of complex field names and their data types from the input DataFrame's schema.
:param df: The input PySpark DataFrame.
:type df: DataFrame
:return: A dictionary with complex field names as keys and their respective data types as values.
:rtype: Dict[str, object]
"""
return {
field.name: field.dataType
for field in df.schema.fields
if isinstance(field.dataType, (ArrayType, StructType, MapType))
}


def flatten_struct(df: DataFrame, col_name: str, sep: str = ":") -> DataFrame:
"""Flattens the specified StructType column in the input DataFrame and returns a new DataFrame with the flattened columns.
:param df: The input PySpark DataFrame.
:type df: DataFrame
:param col_name: The column name of the StructType to be flattened.
:type col_name: str
:param sep: The separator to use in the resulting flattened column names, defaults to ':'.
:type sep: str, optional
:return: The DataFrame with the flattened StructType column.
:rtype: DataFrame
"""
struct_type = _get_complex_fields(df)[col_name]
expanded = [
F.col(f"`{col_name}`.`{k}`").alias(col_name + sep + k)
for k in [n.name for n in struct_type.fields]
]
return df.select("*", *expanded).drop(F.col(f"`{col_name}`"))

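# Illustrative usage sketch (not part of the module above; assumes an active
# SparkSession named `spark`): each struct field becomes a top-level column
# named `<struct_col><sep><field>`.
_people = spark.createDataFrame(
    [(1, ("Alice", 25))],
    "id INT, person STRUCT<name: STRING, age: INT>",
)
_flat_people = flatten_struct(_people, "person")
# _flat_people.columns -> ["id", "person:name", "person:age"]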

def explode_array(df: DataFrame, col_name: str) -> DataFrame:
"""Explodes the specified ArrayType column in the input DataFrame and returns a new DataFrame with the exploded column.
:param df: The input PySpark DataFrame.
:type df: DataFrame
:param col_name: The column name of the ArrayType to be exploded.
:type col_name: str
:return: The DataFrame with the exploded ArrayType column.
:rtype: DataFrame
"""
return df.select("*", F.explode_outer(F.col(f"`{col_name}`")).alias(col_name)).drop(
col_name,
)

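# Illustrative usage sketch (not part of the module above; assumes an active
# SparkSession named `spark`): explode_outer yields one output row per array
# element.
_baskets = spark.createDataFrame([(1, ["apple", "banana"])], "id INT, fruit ARRAY<STRING>")
_exploded = explode_array(_baskets, "fruit")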

def flatten_map(df: DataFrame, col_name: str, sep: str = ":") -> DataFrame:
"""Flattens the specified MapType column in the input DataFrame and returns a new DataFrame with the flattened columns.
:param df: The input PySpark DataFrame.
:type df: DataFrame
:param col_name: The column name of the MapType to be flattened.
:type col_name: str
:param sep: The separator to use in the resulting flattened column names, defaults to ":".
:type sep: str, optional
:return: The DataFrame with the flattened MapType column.
:rtype: DataFrame
"""
keys_df = df.select(F.explode_outer(F.map_keys(F.col(f"`{col_name}`")))).distinct()
keys = [row[0] for row in keys_df.collect()]
key_cols = [
F.col(f"`{col_name}`").getItem(k).alias(col_name + sep + k) for k in keys
]
return df.select(
[F.col(f"`{col}`") for col in df.columns if col != col_name] + key_cols,
)

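# Illustrative usage sketch (not part of the module above; assumes an active
# SparkSession named `spark`): every distinct map key becomes its own column.
_scores = spark.createDataFrame([(1, {"math": 90, "art": 80})], "id INT, score MAP<STRING, INT>")
_flat_scores = flatten_map(_scores, "score")
# _flat_scores.columns -> ["id", "score:math", "score:art"] (key column order may vary)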

def flatten_dataframe(
df: DataFrame,
sep: str = ":",
replace_char: str = "_",
sanitized_columns: bool = False, # noqa: FBT001, FBT002
) -> DataFrame:
"""Flattens the complex columns in the DataFrame.
:param df: The input PySpark DataFrame.
:type df: DataFrame
:param sep: The separator to use in the resulting flattened column names, defaults to ":".
:type sep: str, optional
:param replace_char: The character to replace special characters with in column names, defaults to "_".
:type replace_char: str, optional
:param sanitized_columns: Whether to sanitize column names, defaults to False.
:type sanitized_columns: bool, optional
:return: The DataFrame with all complex data types flattened.
:rtype: DataFrame
.. note:: This function assumes the input DataFrame has a consistent schema across all rows. If you have files with
different schemas, process each separately instead.
.. example:: Example usage:
>>> data = [
(
1,
("Alice", 25),
{"A": 100, "B": 200},
["apple", "banana"],
{"key": {"nested_key": 10}},
{"A#": 1000, "B@": 2000},
),
(
2,
("Bob", 30),
{"A": 150, "B": 250},
["orange", "grape"],
{"key": {"nested_key": 20}},
{"A#": 1500, "B@": 2500},
),
]
>>> df = spark.createDataFrame(data)
>>> flattened_df = flatten_dataframe(df)
>>> flattened_df.show()
>>> flattened_df_with_hyphen = flatten_dataframe(df, replace_char="-")
>>> flattened_df_with_hyphen.show()
"""
complex_fields = _get_complex_fields(df)

while len(complex_fields) != 0:
col_name = next(iter(complex_fields.keys()))

if isinstance(complex_fields[col_name], StructType):
df = flatten_struct(df, col_name, sep) # noqa: PD901

elif isinstance(complex_fields[col_name], ArrayType):
df = explode_array(df, col_name) # noqa: PD901

elif isinstance(complex_fields[col_name], MapType):
df = flatten_map(df, col_name, sep) # noqa: PD901

complex_fields = _get_complex_fields(df)

# Sanitize column names with the specified replace_char
if sanitized_columns:
sanitized_columns = [
sanitize_column_name(col_name, replace_char) for col_name in df.columns
]
df = df.toDF(*sanitized_columns) # noqa: PD901

return df

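# Illustrative usage sketch (not part of the module above; assumes an active
# SparkSession named `spark`): structs are flattened into `<col><sep><field>`
# columns and arrays are exploded into one row per element.
_orders = spark.createDataFrame(
    [(1, ("Alice", 25), ["apple", "banana"])],
    "id INT, person STRUCT<name: STRING, age: INT>, fruit ARRAY<STRING>",
)
_flat_orders = flatten_dataframe(_orders, sep="_")
# _flat_orders has `person_name` and `person_age` columns and two rows,
# one per element of `fruit`.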

def business_days_between(start_date: Column, end_date: Column) -> Column: # noqa: ARG001
"""Function takes two Spark `Columns` and returns a `Column` with the number of business days between the start and the end date.
24 changes: 22 additions & 2 deletions quinn/schema_helpers.py
@@ -1,7 +1,10 @@
from __future__ import annotations

import json

from pyspark.sql import SparkSession
from pyspark.sql import types as T # noqa: N812
from typing import Union

Check failure (GitHub Actions / ruff): quinn/schema_helpers.py:7:20: F401 `typing.Union` imported but unused


def print_schema_as_code(dtype: T.DataType) -> str:

Check failure (GitHub Actions / ruff): quinn/schema_helpers.py:1:1: I001 Import block is un-sorted or un-formatted
@@ -40,8 +43,9 @@ def print_schema_as_code(dtype: T.DataType) -> str:
elif isinstance(dtype, T.DecimalType):
res.append(f"DecimalType({dtype.precision}, {dtype.scale})")

else: # noqa: PLR5501
if str(dtype).endswith("()"): # PySpark 3.3+
else:
# PySpark 3.3+
if str(dtype).endswith("()"): # noqa: PLR5501

Check failure (GitHub Actions / ruff): quinn/schema_helpers.py:46:5: PLR5501 Use `elif` instead of `else` then `if`, to reduce indentation
Check failure (GitHub Actions / ruff): quinn/schema_helpers.py:48:40: RUF100 Unused `noqa` directive (unused: `PLR5501`)
res.append(str(dtype))
else:
res.append(f"{dtype}()")
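Based on the branches shown above, a quick sketch of what print_schema_as_code returns for scalar types (illustrative; assumes the helper is imported from quinn.schema_helpers):

from pyspark.sql import types as T
from quinn.schema_helpers import print_schema_as_code

print_schema_as_code(T.DecimalType(10, 2))  # -> "DecimalType(10, 2)"
print_schema_as_code(T.StringType())        # -> "StringType()"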
@@ -149,3 +153,19 @@ def _convert_nullable(null_str: str) -> bool:
fields.append(field)

return T.StructType(fields=fields)


def complex_fields(schema: T.StructType) -> dict[str, object]:
"""Returns a dictionary of complex field names and their data types from the input DataFrame's schema.
:param schema: The input StructType schema.
:type schema: StructType
:return: A dictionary with complex field names as keys and their respective data types as values.
:rtype: Dict[str, object]
"""
return {
field.name: field.dataType
for field in schema.fields
if isinstance(field.dataType, (T.ArrayType, T.StructType, T.MapType))
}

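A minimal sketch of the new complex_fields helper (illustrative; the schema below is made up for the example):

from pyspark.sql import types as T
from quinn.schema_helpers import complex_fields

schema = T.StructType(
    [
        T.StructField("id", T.IntegerType()),
        T.StructField("person", T.StructType([T.StructField("name", T.StringType())])),
        T.StructField("tags", T.ArrayType(T.StringType())),
    ]
)
complex_fields(schema)
# -> {"person": StructType(...), "tags": ArrayType(StringType(), True)}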
4 changes: 2 additions & 2 deletions quinn/split_columns.py
@@ -68,7 +68,7 @@ def _num_delimiter(col_value1: str) -> int:

# If the length of split_value is same as new_col_names, check if any of the split values is None or empty string
elif any( # noqa: RET506
x is None or x.strip() == "" for x in split_value[: len(new_col_names)]
x is None or x.strip() == "" for x in split_value[: len(new_col_names)] # noqa: PLC1901

Check failure (GitHub Actions / ruff): quinn/split_columns.py:71:90: RUF100 Unused `noqa` directive (non-enabled: `PLC1901`)
):
msg = "Null or empty values are not accepted for columns in strict mode"
raise ValueError(
@@ -93,7 +93,7 @@ def _num_delimiter(col_value1: str) -> int:
if mode == "strict":
# Create an array of select expressions to create new columns from the split values
select_exprs = [
when(split_col_expr.getItem(i) != "", split_col_expr.getItem(i)).alias(
when(split_col_expr.getItem(i) != "", split_col_expr.getItem(i)).alias( # noqa: PLC1901

Check failure (GitHub Actions / ruff): quinn/split_columns.py:96:86: RUF100 Unused `noqa` directive (non-enabled: `PLC1901`)
new_col_names[i],
)
for i in range(len(new_col_names))
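For orientation, a hypothetical call to the function these hunks belong to. The signature is not shown in this diff, so the function and parameter names below are inferred from the variables visible above and may not match the actual API:

split_columns(
    df,
    col_name="full_name",              # column to split (assumed parameter name)
    delimiter="#",                     # separator (assumed parameter name)
    new_col_names=["first", "last"],
    mode="strict",                     # strict mode rejects null/empty split values
)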