Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CAI-185] Chatbot/docker compose with Redis and DynamoDB for local development #1193

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ac9b39a
feat(chatbot): session GSI
batdevis Oct 8, 2024
36c89af
feat(chatbot): docker compose
batdevis Oct 9, 2024
a2c16c7
fix(chatbot): dynamodb and redis for local development with docker co…
batdevis Oct 9, 2024
c633df6
Merge branch 'chatbot/docker-compose-complete' into chatbot/sessions-…
batdevis Oct 9, 2024
08a7fec
chore(chatbot):remove duplicate imports
batdevis Oct 9, 2024
7fe9101
chore(chatbot): linting
batdevis Oct 9, 2024
9b2eb61
fix(chatbot):create index in docker
batdevis Oct 9, 2024
b251837
chore(chatbot): llamaindex index id
batdevis Oct 10, 2024
4ecd4b3
fix(chatbot): create vector index with all docs
batdevis Oct 10, 2024
a84581c
Merge branch 'main' into chatbot/docker-compose-complete
batdevis Oct 10, 2024
ea7d3db
chore(chatbot): terraform lint
batdevis Oct 10, 2024
28695e3
fix(chatbot): terraform syntax
batdevis Oct 10, 2024
238edfd
chore(chatbot): remove dynamodb options
batdevis Oct 10, 2024
5f63560
chore(chatbot): from global to local secondary index
batdevis Oct 10, 2024
859c298
Merge branch 'main' into chatbot/docker-compose-complete
batdevis Oct 11, 2024
f43a771
chore: merge main
batdevis Oct 11, 2024
d96a9f9
chore: remove old var
batdevis Oct 11, 2024
9526c17
chore: merge main
batdevis Oct 11, 2024
a5177df
Update apps/chatbot/docker/compose.yaml
batdevis Oct 11, 2024
4daccf8
chore: remove logs
batdevis Oct 11, 2024
f5bdfd3
Merge branch 'chatbot/docker-compose-complete' of github.com:pagopa/d…
batdevis Oct 11, 2024
c123b5c
fix(chatbot): compose vars
batdevis Oct 13, 2024
aa59ca5
Merge branch 'main' into chatbot/docker-compose-complete
batdevis Oct 13, 2024
5e07dbe
Update modules
mdciri Oct 16, 2024
da8a41c
Update config prompts
mdciri Oct 16, 2024
b57d55c
Update env example
mdciri Oct 16, 2024
62fffa1
Merge branch 'main' into chatbot/docker-compose-complete
batdevis Oct 16, 2024
f7b05e6
Merge branch 'languages/chatbot/cai-198' into chatbot/docker-compose-…
batdevis Oct 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/chatbot/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ CHB_ENGINE_SIMILARITY_CUTOFF=...
CHB_ENGINE_USE_ASYNC=...
CHB_ENGINE_USE_STREAMING=...
CHB_QUERY_TABLE_PREFIX=chatbot-local
CHB_REDIS_INDEX_NAME=zero
CHB_DYNAMODB_URL=http://localhost:8000
6 changes: 5 additions & 1 deletion apps/chatbot/docker/app.local.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
FROM python:3.12.4-slim-bullseye
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
apt-get install -y \
curl

ENV PYTHONPATH=/app

RUN pip install --upgrade pip \
Expand All @@ -14,4 +18,4 @@ RUN poetry install

COPY . .

CMD ["fastapi", "dev", "src/app/main.py", "--port", "8080"]
CMD ["fastapi", "dev", "src/app/main.py", "--port", "8080", "--host", "0.0.0.0"]
56 changes: 56 additions & 0 deletions apps/chatbot/docker/compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
services:
api:
build:
context: ..
dockerfile: docker/app.local.Dockerfile
ports:
- "8080:8080"
volumes:
- ..:/app
- ./files/.aws:/root/.aws
- ./build-devp/out:/app/build-devp/out
depends_on:
redis:
condition: service_started
dynamodb:
condition: service_started
networks:
- ntw

dynamodb:
image: amazon/dynamodb-local:2.5.2
environment:
- AWS_ACCESS_KEY_ID=dummy
- AWS_SECRET_ACCESS_KEY=dummy
- AWS_DEFAULT_REGION=local
ports:
- "8000:8000"
networks:
- ntw

redis:
image: redis/redis-stack:7.2.0-v13
ports:
- "6379:6379"
networks:
- ntw

create_index:
build:
context: ..
dockerfile: docker/app.local.Dockerfile
ports:
- "8080:8080"
volumes:
- ..:/app
batdevis marked this conversation as resolved.
Show resolved Hide resolved
- ./build-devp/out:/app/build-devp/out
command: "python src/modules/create_vector_index.py --params config/params.yaml"
tty: true
depends_on:
redis:
condition: service_started
networks:
- ntw

networks:
ntw:
2 changes: 2 additions & 0 deletions apps/chatbot/docker/docker-compose-up-api.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
docker compose -f docker/compose.yaml -p chatbot up api
2 changes: 2 additions & 0 deletions apps/chatbot/docker/docker-run-create-index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
docker compose -f docker/compose.yaml -p chatbot up create_index
2 changes: 2 additions & 0 deletions apps/chatbot/docker/docker-run-local-bash.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
docker run -it --env-file ./.env fastapi-local bash
2 changes: 2 additions & 0 deletions apps/chatbot/docker/files/.aws/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[profile default]
region = eu-south-1
3 changes: 3 additions & 0 deletions apps/chatbot/docker/files/.aws/credentials
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[default]
aws_access_key_id = 123
aws_secret_access_key = xyz
christian-calabrese marked this conversation as resolved.
Show resolved Hide resolved
1,583 changes: 888 additions & 695 deletions apps/chatbot/poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion apps/chatbot/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ llama-index-llms-gemini = "^0.3.4"
google-generativeai = "^0.5.2"
llama-index-embeddings-gemini = "^0.2.0"
llama-index-llms-bedrock-converse = "^0.3.0"
chromedriver-py = "^129.0.6668.91"
llama-index-postprocessor-presidio = "^0.2.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"
51 changes: 29 additions & 22 deletions apps/chatbot/src/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,30 @@

params = yaml.safe_load(open("config/params.yaml", "r"))
prompts = yaml.safe_load(open("config/prompts.yaml", "r"))
chatbot = Chatbot(params, prompts)

AWS_DEFAULT_REGION = os.getenv('CHB_AWS_DEFAULT_REGION', os.getenv('AWS_DEFAULT_REGION', None))

chatbot = Chatbot(params, prompts)


class Query(BaseModel):
    # Payload model describing a single question submitted to the chatbot API.
    # The question text; required (no default value).
    question: str
    # Optional string, defaults to None when omitted.
    # NOTE(review): presumably a timestamp of when the question was asked;
    # the expected format is not visible here -- confirm against the caller.
    queriedAt: str | None = None

if (os.getenv('environment', 'dev') == 'local'):
profile_name='dummy'
endpoint_url='http://localhost:8000'
region_name = AWS_DEFAULT_REGION

boto3_session = boto3.session.Session(
profile_name = locals().get('profile_name', None),
region_name=locals().get('region_name', None)
region_name=AWS_DEFAULT_REGION
)

dynamodb = boto3_session.resource(
'dynamodb',
endpoint_url=locals().get('endpoint_url', None),
region_name=locals().get('region_name', None),
)
if (os.getenv('environment', 'dev') == 'local'):
dynamodb = boto3_session.resource(
'dynamodb',
endpoint_url=os.getenv('CHB_DYNAMODB_URL', 'http://localhost:8000'),
region_name=AWS_DEFAULT_REGION
)
else:
dynamodb = boto3_session.resource(
'dynamodb',
region_name=AWS_DEFAULT_REGION
)

table_queries = dynamodb.Table(
f"{os.getenv('CHB_QUERY_TABLE_PREFIX', 'chatbot')}-queries"
Expand Down Expand Up @@ -160,12 +160,13 @@ async def sessions_fetching(
raise HTTPException(status_code=422, detail=f"[sessions_fetching] userId: {userId}, error: {e}")

# TODO: pagination
items = db_response.get('Items', [])
result = {
"items": db_response['Items'],
"items": items,
"page": 1,
"pages": 1,
"size": len(db_response['Items']),
"total": len(db_response['Items']),
"size": len(items),
"total": len(items),
}
return result

Expand Down Expand Up @@ -214,20 +215,26 @@ async def queries_fetching(
sessionId = last_session_id(userId)

try:
# TODO: add userId filter
db_response = table_queries.query(
KeyConditionExpression=Key("sessionId").eq(sessionId)
KeyConditionExpression=Key("sessionId").eq(sessionId) &
Key("id").eq(userId)
)
except (BotoCoreError, ClientError) as e:
raise HTTPException(status_code=422, detail=f"[queries_fetching] sessionId: {sessionId}, error: {e}")

result = db_response['Items']
result = db_response.get('Items', [])
return result


def last_session_id(userId: str):
    """Return the id of the most recent session belonging to *userId*.

    Queries ``table_sessions`` through the ``SessionsByCreatedAtIndex``
    index in descending sort-key order and asks for a single item, i.e. the
    newest session of the user.

    Errors raised by the DynamoDB client are not caught here; they propagate
    to the caller.

    :param userId: The user whose latest session is looked up.
    :return: The ``id`` attribute of the newest session item, or ``None``
        when the user has no sessions.
    """
    # The previous placeholder (`return '1'`) was removed: it short-circuited
    # the function and made the query below unreachable.
    db_response = table_sessions.query(
        IndexName='SessionsByCreatedAtIndex',
        KeyConditionExpression=Key('userId').eq(userId),
        ScanIndexForward=False,  # descending order: newest first
        Limit=1,
    )
    items = db_response.get('Items', [])
    if not items:
        return None

    # Callers use the result as a key value (Key("sessionId").eq(...)), so a
    # scalar id is returned rather than the whole item.
    # NOTE(review): assumes session items store their identifier under the
    # 'id' attribute -- confirm against the sessions table schema.
    return items[0].get('id')

@app.patch("/queries/{id}")
async def query_feedback (badAnswer: bool):
Expand Down
3 changes: 2 additions & 1 deletion apps/chatbot/src/modules/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def get_ssm_parameter(name: str, default: str | None = None) -> str | None:
:param default: The default value to return if the parameter is not found.
:return: The value of the requested parameter.
"""

ssm = boto3.client(
"ssm",
aws_access_key_id=AWS_ACCESS_KEY_ID,
Expand All @@ -37,4 +38,4 @@ def get_ssm_parameter(name: str, default: str | None = None) -> str | None:
return default

logging.debug(f"Parameter {name} retrieved from SSM")
return response["Parameter"]["Value"]
return response["Parameter"]["Value"]
39 changes: 30 additions & 9 deletions apps/chatbot/src/modules/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
import html2text
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from typing import List, Tuple
from chromedriver_py import binary_path

from bs4 import BeautifulSoup
from selenium import webdriver
Expand Down Expand Up @@ -39,7 +42,6 @@

load_dotenv()


PROVIDER = os.getenv("CHB_PROVIDER")
assert PROVIDER in ["google", "aws"]

Expand Down Expand Up @@ -115,15 +117,18 @@ def filter_html_files(html_files: List[str]) -> List[str]:
pattern = re.compile(r"/v\d{1,2}.")
pattern2 = re.compile(r"/\d{1,2}.")
filtered_files = [file for file in html_files if not pattern.search(file) and not pattern2.search(file)]
logging.info(f"[vector_database.py] filter_html_files len(filtered_files): {len(filtered_files)}")
return filtered_files


def get_html_files(root_folder: str) -> List[str]:
    """Collect the ``.html`` files found under *root_folder*.

    The directory tree is walked recursively; every file whose name ends in
    ``.html`` is recorded with its path joined to the directory it was found
    in. The collected paths are passed through ``filter_html_files`` and
    returned in sorted order.

    :param root_folder: Directory at which the walk starts.
    :return: Sorted list of the filtered HTML file paths.
    """
    logging.info(f"[vector_database.py] get_html_files({root_folder})")
    found_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(root_folder)
        for filename in filenames
        if filename.endswith(".html")
    ]
    logging.info(f"[vector_database.py] get_html_files len(html_files): {len(found_paths)}")
    kept_paths = filter_html_files(found_paths)
    return sorted(kept_paths)


Expand Down Expand Up @@ -154,10 +159,14 @@ def create_documentation(
if documentation_dir[-1] != "/":
documentation_dir += "/"

logging.info(f"Getting documentation from: {documentation_dir}")
logging.info(f"[vector_database.py] Getting documentation from: {documentation_dir}")
logging.info(f"[vector_database.py] create_documentation: DYNAMIC_HTML: {DYNAMIC_HTMLS}")
logging.info(f"[vector_database.py] create_documentation: documentation_dir: {documentation_dir}")

html_files = get_html_files(documentation_dir)
logging.info(f"[vector_database.py] create_documentation: len(html_files): {len(html_files)}")
dynamic_htmls = [os.path.join(documentation_dir, path) for path in DYNAMIC_HTMLS]
logging.info(f"[vector_database.py] create_documentation: len(dynamic_htmls): {len(dynamic_htmls)}")
documents = []
hash_table = {}
empty_pages = []
Expand All @@ -166,17 +175,28 @@ def create_documentation(

for file in tqdm.tqdm(html_files, total=len(html_files), desc="Extracting HTML"):

if file in dynamic_htmls:
# FIX: resolve webdriver.Chrome "self.assert_process_still_running" error in docker
# if file in dynamic_htmls:
if 6 == 9:
url = file.replace(documentation_dir, f"{website_url}/").replace(".html", "")
driver = webdriver.Chrome()

# svc = webdriver.ChromeService(executable_path=binary_path)
service = Service(executable_path=binary_path)
options = webdriver.ChromeOptions()
options.add_argument('--headless=new')
options.add_argument('--no-sandbox')
options.add_argument('user-agent=fake-useragent')
driver = webdriver.Chrome(service=service, options=options)

logging.info(f"[vector_database.py] create_documentation: driver.get({url})")
driver.get(url)
time.sleep(5)
title, text = html2markdown(driver.page_source)
driver.quit()
else:
title, text = html2markdown(open(file))

if text == None or text == "" or text == "None":
if text is None or text == "" or text == "None":
# print(file)
empty_pages.append(file)

Expand Down Expand Up @@ -220,7 +240,7 @@ def build_automerging_index_redis(
chunk_overlap: int
) -> VectorStoreIndex:

logging.info("Storing vector index and hash table on Redis..")
logging.info("[vector_database.py] Storing vector index and hash table on Redis..")

Settings.llm = llm
Settings.embed_model = embed_model
Expand All @@ -236,9 +256,9 @@ def build_automerging_index_redis(
key=key,
val=value
)
logging.info("Hash table is now on Redis.")
logging.info("[vector_database.py] Hash table is now on Redis.")

logging.info("Creating index...")
logging.info("[vector_database.py] Creating index...")
nodes = Settings.node_parser.get_nodes_from_documents(documents)
leaf_nodes = get_leaf_nodes(nodes)

Expand All @@ -262,6 +282,7 @@ def build_automerging_index_redis(
automerging_index.set_index_id(INDEX_ID)
logging.info("Created vector index successfully and stored on Redis.")

automerging_index.set_index_id("1234")
return automerging_index


Expand All @@ -285,7 +306,7 @@ def load_automerging_index_redis(
schema=REDIS_SCHEMA
)

logging.info(f"Loading vector index from Redis...")
logging.info("[vector_database.py] Loading vector index from Redis...")
storage_context = StorageContext.from_defaults(
vector_store=redis_vector_store,
docstore=REDIS_DOCSTORE,
Expand Down
14 changes: 14 additions & 0 deletions apps/infrastructure/src/modules/chatbot/dynamodb.tf
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,19 @@ module "dynamodb_chatbot_sessions" {
name = "userId"
type = "S"
},
{
name = "createdAt"
type = "S"
},
]

# LSI for query on created_at
local_secondary_indexes = [
{
name = "SessionsByCreatedAtIndex"
hash_key = "userId"
range_key = "createdAt"
projection_type = "ALL"
}
]
}
Loading