Hamilton + AWS #768

Merged (4 commits) on Apr 1, 2024
111 changes: 111 additions & 0 deletions examples/aws/glue/README.md
@@ -0,0 +1,111 @@
# Deploy Hamilton Functions as an AWS Glue Job

[AWS Glue](https://aws.amazon.com/glue/) is a serverless data integration service. This guide demonstrates deploying a "hello-world" [processing job](https://docs.aws.amazon.com/glue/latest/dg/add-job-python.html) using Hamilton functions on AWS Glue.

## Prerequisites

- **AWS CLI Setup**: Make sure the AWS CLI is set up on your machine. If you haven't done this yet, no worries! You can follow the [Quick Start guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html) for easy setup instructions.

## Step-by-Step Guide

### 1. Build wheel with Hamilton functions

First things first: an AWS Glue job runs a single Python script, but you can include external code (like our Hamilton functions) by attaching it as a Python wheel. So, let's package our code and get it ready for action.
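
For reference, the wheel in this example is built from the minimal `setup.py` included under `app/` (shown here for convenience):

```python
# examples/aws/glue/app/setup.py -- minimal packaging config for the hamilton_functions package
from setuptools import setup

setup(name="hamilton_functions", version="0.1", packages=["hamilton_functions"])
```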

- **Install build package:**

This command installs the `build` package, which we'll use to create our Python wheel.

```shell
pip install build
```

- **Build the Python wheel:**

```shell
cd app \
&& python -m build --wheel --skip-dependency-check \
&& cd ..
```

### 2. Upload all necessary files to S3

- **Upload the wheel file to S3:**

Replace `<YOUR_PATH_TO_WHL>` with your specific S3 bucket and path:

```shell
aws s3 cp \
app/dist/hamilton_functions-0.1-py3-none-any.whl \
s3://<YOUR_PATH_TO_WHL>/hamilton_functions-0.1-py3-none-any.whl
```

- **Upload the main Python script to S3:**

Replace `<YOUR_PATH_TO_SCRIPT>` with your specific S3 bucket and path:

```shell
aws s3 cp \
processing.py \
s3://<YOUR_PATH_TO_SCRIPT>/processing.py
```

- **Upload the input data to S3:**

Replace `<YOUR_PATH_TO_INPUT_DATA>` with your specific S3 bucket and path:

```shell
aws s3 cp \
data/input_table.csv \
s3://<YOUR_PATH_TO_INPUT_DATA>
```

### 3. Create a simple role for AWS Glue job execution

- **Create the Role**:

```shell
aws iam create-role \
--role-name GlueProcessorRole \
--assume-role-policy-document '{"Version": "2012-10-17", "Statement": [{ "Effect": "Allow", "Principal": { "Service": "glue.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
```

- **Attach Policies to the Role**:

Here we grant full access to S3 as an example. For production environments it's important to restrict access appropriately; see the sketch after this step for one way to scope the policy down.

```shell
aws iam attach-role-policy \
--role-name GlueProcessorRole \
--policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
aws iam attach-role-policy \
--role-name GlueProcessorRole \
--policy-arn arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
```
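
If you'd rather not grant `AmazonS3FullAccess`, one option is an inline policy scoped to the buckets the job actually touches. Below is a minimal boto3 sketch; `<YOUR_BUCKET>` is a placeholder, and the exact set of S3 actions depends on what your job needs:

```python
# Minimal sketch: attach a bucket-scoped inline policy instead of AmazonS3FullAccess.
import json

import boto3

iam = boto3.client("iam")
iam.put_role_policy(
    RoleName="GlueProcessorRole",
    PolicyName="GlueProcessorS3Access",
    PolicyDocument=json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Action": ["s3:GetObject", "s3:PutObject", "s3:ListBucket"],
                    # Replace <YOUR_BUCKET> with the bucket(s) the job reads from and writes to
                    "Resource": [
                        "arn:aws:s3:::<YOUR_BUCKET>",
                        "arn:aws:s3:::<YOUR_BUCKET>/*",
                    ],
                }
            ],
        }
    ),
)
```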

### 4. Create and run the job

- **Create a job:**

Ensure all paths are correctly replaced with the actual ones:

```shell
aws glue create-job \
--name test_hamilton_script \
--role GlueProcessorRole \
--command '{"Name" : "pythonshell", "PythonVersion": "3.9", "ScriptLocation" : "s3://<YOUR_PATH_TO_SCRIPT>/processing.py"}' \
--max-capacity 0.0625 \
--default-arguments '{"--extra-py-files" : "s3://<YOUR_PATH_TO_WHL>/hamilton_functions-0.1-py3-none-any.whl", "--additional-python-modules" : "sf-hamilton"}'
```

- **Run the job:**

Ensure all paths are correctly replaced with the actual ones:

```shell
aws glue start-job-run \
--job-name test_hamilton_script \
--arguments '{"--input-table" : "s3://<YOUR_PATH_TO_INPUT_DATA>", "--output-table" : "s3://<YOUR_PATH_TO_OUTPUT_DATA>"}'
```

Once you've run the job, you should see an output file at `s3://<YOUR_PATH_TO_OUTPUT_DATA>`.
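
If you prefer to drive these last steps from Python, here is a rough boto3 sketch that starts the job and polls its status (same placeholders as above; it assumes boto3 is installed and your AWS credentials are configured):

```python
# Rough boto3 equivalent of `aws glue start-job-run`, plus simple status polling.
import time

import boto3

glue = boto3.client("glue")

run = glue.start_job_run(
    JobName="test_hamilton_script",
    Arguments={
        "--input-table": "s3://<YOUR_PATH_TO_INPUT_DATA>",
        "--output-table": "s3://<YOUR_PATH_TO_OUTPUT_DATA>",
    },
)

# Poll until the run reaches a terminal state.
while True:
    state = glue.get_job_run(JobName="test_hamilton_script", RunId=run["JobRunId"])["JobRun"]["JobRunState"]
    print(state)
    if state in ("SUCCEEDED", "FAILED", "STOPPED", "TIMEOUT", "ERROR"):
        break
    time.sleep(15)
```
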
38 changes: 38 additions & 0 deletions examples/aws/glue/app/hamilton_functions/functions.py
@@ -0,0 +1,38 @@
import pandas as pd

from hamilton.function_modifiers import extract_columns


@extract_columns("spend", "signups")
def raw_table(input_table: pd.DataFrame) -> pd.DataFrame:
return input_table


def avg_3wk_spend(spend: pd.Series) -> pd.Series:
"""Rolling 3 week average spend."""
return spend.rolling(3).mean()


def spend_per_signup(spend: pd.Series, signups: pd.Series) -> pd.Series:
"""The cost per signup in relation to spend."""
return spend / signups


def spend_mean(spend: pd.Series) -> float:
"""Shows function creating a scalar. In this case it computes the mean of the entire column."""
return spend.mean()


def spend_zero_mean(spend: pd.Series, spend_mean: float) -> pd.Series:
"""Shows function that takes a scalar. In this case to zero mean spend."""
return spend - spend_mean


def spend_std_dev(spend: pd.Series) -> float:
"""Function that computes the standard deviation of the spend column."""
return spend.std()


def spend_zero_mean_unit_variance(spend_zero_mean: pd.Series, spend_std_dev: float) -> pd.Series:
"""Function showing one way to make spend have zero mean and unit variance."""
return spend_zero_mean / spend_std_dev
3 changes: 3 additions & 0 deletions examples/aws/glue/app/setup.py
@@ -0,0 +1,3 @@
from setuptools import setup

setup(name="hamilton_functions", version="0.1", packages=["hamilton_functions"])
7 changes: 7 additions & 0 deletions examples/aws/glue/data/input_table.csv
@@ -0,0 +1,7 @@
signups,spend
1,10
10,10
50,20
100,40
200,40
400,50
31 changes: 31 additions & 0 deletions examples/aws/glue/processing.py
@@ -0,0 +1,31 @@
import sys

import pandas as pd

# awsglue is installed in the AWS Glue worker environment
from awsglue.utils import getResolvedOptions
from hamilton_functions import functions

from hamilton import driver

if __name__ == "__main__":

    # getResolvedOptions parses the --input-table/--output-table job arguments;
    # the dashes become underscores in the keys of the returned dict.
    args = getResolvedOptions(sys.argv, ["input-table", "output-table"])

df = pd.read_csv(args["input_table"])

dr = driver.Driver({}, functions)

inputs = {"input_table": df}

output_columns = [
"spend",
"signups",
"avg_3wk_spend",
"spend_per_signup",
"spend_zero_mean_unit_variance",
]

# DAG execution
df_result = dr.execute(output_columns, inputs=inputs)
df_result.to_csv(args["output_table"])
10 changes: 10 additions & 0 deletions examples/aws/lambda/Dockerfile
@@ -0,0 +1,10 @@
FROM public.ecr.aws/lambda/python:3.11

COPY requirements.txt ./
RUN pip install -r requirements.txt

# Disable Hamilton's anonymous usage telemetry inside the Lambda environment
ENV HAMILTON_TELEMETRY_ENABLED=false

COPY app ./app

CMD ["app.lambda_handler.lambda_handler"]
96 changes: 96 additions & 0 deletions examples/aws/lambda/README.md
@@ -0,0 +1,96 @@
# Deploy Hamilton in AWS Lambda

[AWS Lambda](https://aws.amazon.com/lambda/) is a serverless compute service on AWS.

This example shows how to deploy a "hello-world" AWS Lambda function that runs Hamilton functions.
It is based on the official instructions: https://docs.aws.amazon.com/lambda/latest/dg/python-image.html#python-image-instructions

## Prerequisites

- **AWS CLI Setup**: Make sure the AWS CLI is set up on your machine. If you haven't done this yet, no worries! You can follow the [Quick Start guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html) for easy setup instructions.

## Step-by-Step Guide

### 1. Build the Docker image

- **Build the Docker image for deployment to AWS ECR:**

```shell
docker build --platform linux/amd64 -t aws-lambda-hamilton .
```

- **Local tests:**

Run the Docker container:

```shell
docker run -p 9000:8080 aws-lambda-hamilton
```

Send a test request to check that the container handles it correctly:

```shell
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": {"columns":["signups","spend"],"index":[0,1,2,3,4,5],"data":[[1,10],[10,10],[50,20],[100,40],[200,40],[400,50]]}}'
```
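
If you prefer Python to curl, here is a small sketch of the same request (it assumes the `requests` package is installed locally):

```python
# Small Python alternative to the curl command above, hitting the local Lambda runtime emulator.
import io

import pandas as pd
import requests

payload = {
    "body": {
        "columns": ["signups", "spend"],
        "index": [0, 1, 2, 3, 4, 5],
        "data": [[1, 10], [10, 10], [50, 20], [100, 40], [200, 40], [400, 50]],
    }
}

resp = requests.post(
    "http://localhost:9000/2015-03-31/functions/function/invocations", json=payload
)

# The handler returns the result DataFrame serialized in pandas' "split" orientation.
result = pd.read_json(io.StringIO(resp.json()["body"]), orient="split")
print(result)
```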

### 2. Create an AWS ECR repository

Ensure the AWS account number (`111122223333`) is correctly replaced with yours:

- **Authenticate Docker to Amazon ECR**:

Retrieve an authentication token to authenticate your Docker client to your Amazon Elastic Container Registry (ECR):

```shell
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 111122223333.dkr.ecr.us-east-1.amazonaws.com
```

- **Create the ECR Repository**:

```shell
aws ecr create-repository --repository-name aws-lambda-hamilton \
--region us-east-1 \
--image-scanning-configuration scanOnPush=true \
--image-tag-mutability MUTABLE
```

### 3. Deploy the Image to AWS ECR

Ensure the AWS account number (`111122223333`) is correctly replaced with yours:

```shell
docker tag aws-lambda-hamilton 111122223333.dkr.ecr.us-east-1.amazonaws.com/aws-lambda-hamilton:latest
docker push 111122223333.dkr.ecr.us-east-1.amazonaws.com/aws-lambda-hamilton:latest
```

### 4. Create a simple AWS Lambda role

Example of creating an AWS Role for Lambda execution:

```shell
aws iam create-role \
--role-name lambda-ex \
--assume-role-policy-document '{"Version": "2012-10-17","Statement": [{ "Effect": "Allow", "Principal": {"Service": "lambda.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
```

### 5. Create AWS Lambda

Ensure the AWS account number (`111122223333`) is correctly replaced with yours:

```shell
aws lambda create-function \
--function-name aws-lambda-hamilton \
--package-type Image \
--code ImageUri=111122223333.dkr.ecr.us-east-1.amazonaws.com/aws-lambda-hamilton:latest \
--role arn:aws:iam::111122223333:role/lambda-ex
```

### 6. Test AWS Lambda

```shell
aws lambda invoke \
--function-name aws-lambda-hamilton \
--cli-binary-format raw-in-base64-out \
--payload '{"body": {"columns":["signups","spend"],"index":[0,1,2,3,4,5],"data":[[1,10],[10,10],[50,20],[100,40],[200,40],[400,50]]}}' \
response.json
```
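
The invocation writes the Lambda's response to `response.json`; here is a quick sketch for turning it back into a DataFrame locally (assuming pandas is installed):

```python
# Read the saved Lambda response and reconstruct the result DataFrame.
import io
import json

import pandas as pd

with open("response.json") as f:
    response = json.load(f)

# The handler serializes the result with pandas' "split" orientation.
df = pd.read_json(io.StringIO(response["body"]), orient="split")
print(df)
```
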
38 changes: 38 additions & 0 deletions examples/aws/lambda/app/functions.py
@@ -0,0 +1,38 @@
import pandas as pd

from hamilton.function_modifiers import extract_columns


@extract_columns("spend", "signups")
def raw_table(input_table: pd.DataFrame) -> pd.DataFrame:
return input_table


def avg_3wk_spend(spend: pd.Series) -> pd.Series:
"""Rolling 3 week average spend."""
return spend.rolling(3).mean()


def spend_per_signup(spend: pd.Series, signups: pd.Series) -> pd.Series:
"""The cost per signup in relation to spend."""
return spend / signups


def spend_mean(spend: pd.Series) -> float:
"""Shows function creating a scalar. In this case it computes the mean of the entire column."""
return spend.mean()


def spend_zero_mean(spend: pd.Series, spend_mean: float) -> pd.Series:
"""Shows function that takes a scalar. In this case to zero mean spend."""
return spend - spend_mean


def spend_std_dev(spend: pd.Series) -> float:
"""Function that computes the standard deviation of the spend column."""
return spend.std()


def spend_zero_mean_unit_variance(spend_zero_mean: pd.Series, spend_std_dev: float) -> pd.Series:
"""Function showing one way to make spend have zero mean and unit variance."""
return spend_zero_mean / spend_std_dev
24 changes: 24 additions & 0 deletions examples/aws/lambda/app/lambda_handler.py
@@ -0,0 +1,24 @@
import pandas as pd

from hamilton import driver

from . import functions


def lambda_handler(event, context):

    # The request body carries a DataFrame serialized in pandas' "split" orientation
    # (the same shape as the curl/invoke payloads in the README).
    df = pd.DataFrame(**event["body"])

dr = driver.Driver({}, functions)

output_columns = [
"spend",
"signups",
"avg_3wk_spend",
"spend_per_signup",
"spend_zero_mean_unit_variance",
]

df_result = dr.execute(output_columns, inputs={"input_table": df})

return {"statusCode": 200, "body": df_result.to_json(orient="split")}
2 changes: 2 additions & 0 deletions examples/aws/lambda/requirements.txt
@@ -0,0 +1,2 @@
pandas
sf-hamilton