U/kinnym/multimodalwork #2090

Draft
wants to merge 8 commits into base: 0.2
107 changes: 107 additions & 0 deletions notebook/agentchat_mic_in.ipynb
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'distutils'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspeech_recognition\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msr\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdistutils\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m#from distutils.version import LooseVersion\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_text_input\u001b[39m():\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Ask user for text input\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'distutils'"
]
}
],
"source": [
"#\n",
"# Simple Sample to Read input from a speaker and convert it to Text\n",
"#\n",
"import speech_recognition as sr\n",
"import distutils\n",
"\n",
"#\n",
"# from distutils.version import LooseVersion\n",
"#\n",
"\n",
"\n",
"def get_text_input():\n",
" # Ask user for text input\n",
" text = input(\"Enter your text: \")\n",
" return text\n",
"\n",
"\n",
"def get_voice_input():\n",
" # Initialize recognizer\n",
" recognizer = sr.Recognizer()\n",
"\n",
" with sr.Microphone() as source:\n",
" print(\"Speak now...\")\n",
" # Adjust for ambient noise\n",
" recognizer.adjust_for_ambient_noise(source)\n",
" # Listen to user's voice\n",
" audio = recognizer.listen(source)\n",
"\n",
" try:\n",
" print(\"Processing voice input...\")\n",
" # Use recognizer to convert speech to text\n",
" text = recognizer.recognize_google(audio)\n",
" return text\n",
" except sr.UnknownValueError:\n",
" print(\"Sorry, I could not understand your speech.\")\n",
" return \"\"\n",
" except sr.RequestError as e:\n",
" print(f\"Error occurred; {e}\")\n",
" return \"\"\n",
"\n",
"\n",
"def main():\n",
" # Prompt user to choose input method\n",
" print(\"Select input method:\")\n",
" print(\"1. Text input\")\n",
" print(\"2. Voice input\")\n",
" choice = input(\"Enter your choice (1/2): \")\n",
"\n",
" if choice == \"1\":\n",
" text = get_text_input()\n",
" elif choice == \"2\":\n",
" text = get_voice_input()\n",
" else:\n",
" print(\"Invalid choice. Please enter 1 or 2.\")\n",
" return\n",
"\n",
" if text:\n",
" print(\"Your input is:\", text)\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kinmap",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
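A quick way to exercise the same recognizer logic without microphone hardware is to transcribe a pre-recorded WAV file. The sketch below is an illustration only, not part of this PR: `sample.wav` is a placeholder path, and `recognize_google` still needs network access.

```python
# Minimal sketch: transcribe a pre-recorded WAV file with SpeechRecognition,
# useful for testing get_voice_input-style logic without a microphone.
# "sample.wav" is a placeholder path, not a file shipped with this PR.
import speech_recognition as sr

recognizer = sr.Recognizer()
with sr.AudioFile("sample.wav") as source:
    audio = recognizer.record(source)  # read the whole file into an AudioData object

try:
    print(recognizer.recognize_google(audio))
except sr.UnknownValueError:
    print("Sorry, I could not understand the audio.")
except sr.RequestError as e:
    print(f"Error occurred: {e}")
```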
168 changes: 168 additions & 0 deletions notebook/agentchat_webcam_in.ipynb
@@ -0,0 +1,168 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e6243c38-a8c4-4d6d-b65a-ea6d48cdbc87",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"K:\\Repos\\kinmap\\kinmap\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"#\n",
"# 1. A simple Python application that will opens the Web Camera and displays frames.\n",
"# 2. The frames that are read from the camera are then sent to Google GenAI.\n",
"# 3. The only change that is needed is to plug in your Google API key and then run.\n",
"#\n",
"\n",
"# import cv2\n",
"import threading\n",
"from PIL import Image, ImageTk\n",
"import tkinter as tk\n",
"import cv2\n",
"import io\n",
"from dotenv import load_dotenv\n",
"import os\n",
"import google.generativeai as genai\n",
"import google.ai.generativelanguage as glm\n",
"\n",
"# Plug in the API key to run the code.\n",
"GOOGLE_API_KEY = os.getenv(\"Your Google API Key\")\n",
"genai.configure(api_key=GOOGLE_API_KEY)\n",
"\n",
"load_dotenv()\n",
"\n",
"# genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))\n",
"model = genai.GenerativeModel(\"gemini-pro-vision\")\n",
"\n",
"\n",
"class ContentDescriber:\n",
" def __init__(self, root, user_input, video_handler):\n",
" self.root = root\n",
" self.user_input = user_input\n",
" self.video_handler = video_handler\n",
" self.message_var = tk.StringVar()\n",
"\n",
" def describe_content(self):\n",
" current_frame = self.video_handler.get_current_frame()\n",
" if current_frame is not None:\n",
" pil_image = Image.fromarray(cv2.cvtColor(current_frame, cv2.COLOR_BGR2RGB))\n",
" img_byte_arr = io.BytesIO()\n",
" pil_image.save(img_byte_arr, format=\"JPEG\")\n",
" blob = glm.Blob(mime_type=\"image/jpeg\", data=img_byte_arr.getvalue())\n",
" user_request = self.user_input.get()\n",
" response = model.generate_content([user_request, blob], stream=True)\n",
" for chunk in response:\n",
" self.root.after(0, self.update_message, chunk.text)\n",
" else:\n",
" self.root.after(0, self.update_message, \"No frame available\")\n",
"\n",
" def threaded_describe_content(self):\n",
" describe_thread = threading.Thread(target=self.describe_content)\n",
" describe_thread.start()\n",
"\n",
" def update_message(self, new_text):\n",
" current_text = self.message_var.get()\n",
" updated_text = current_text + new_text + \"\\n\"\n",
" self.message_var.set(updated_text)\n",
"\n",
"\n",
"class VideoStreamHandler:\n",
" def __init__(self, root, canvas):\n",
" self.root = root\n",
" self.canvas = canvas\n",
" self.cap = cv2.VideoCapture(0)\n",
" self.photo = None\n",
" self.current_frame = None\n",
"\n",
" def video_stream(self):\n",
" while self.cap.isOpened():\n",
" ret, frame = self.cap.read()\n",
" if ret:\n",
" self.current_frame = frame\n",
" cv2image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
" img = Image.fromarray(cv2image)\n",
" self.photo = ImageTk.PhotoImage(image=img)\n",
" self.canvas.create_image(0, 0, image=self.photo, anchor=tk.NW)\n",
" self.root.update()\n",
"\n",
" def start_stream(self):\n",
" thread = threading.Thread(target=self.video_stream)\n",
" thread.start()\n",
"\n",
" def stop_video(self):\n",
" if self.cap.isOpened():\n",
" self.cap.release()\n",
" self.root.destroy()\n",
"\n",
" def get_current_frame(self):\n",
" return self.current_frame\n",
"\n",
"\n",
"# Main GUI setup and button handlers\n",
"root = tk.Tk()\n",
"root.title(\"Webcam Stream\")\n",
"\n",
"user_input = tk.Entry(root, width=50)\n",
"user_input.pack()\n",
"\n",
"canvas = tk.Canvas(root, width=640, height=480)\n",
"canvas.pack()\n",
"\n",
"video_handler = VideoStreamHandler(root, canvas)\n",
"content_describer = ContentDescriber(root, user_input, video_handler)\n",
"\n",
"button = tk.Button(root, text=\"Stop\", width=50, command=video_handler.stop_video)\n",
"button.pack(anchor=tk.CENTER, expand=True)\n",
"\n",
"describe_button = tk.Button(\n",
" root, text=\"Describe the frame\", width=50, command=content_describer.threaded_describe_content\n",
")\n",
"describe_button.pack(anchor=tk.CENTER, expand=True)\n",
"\n",
"message_label = tk.Label(root, textvariable=content_describer.message_var, wraplength=500)\n",
"message_label.pack()\n",
"\n",
"video_handler.start_stream()\n",
"\n",
"root.mainloop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66c27126-b5b1-479a-b96d-f103301d8f32",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
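For reference, the Gemini call used in the webcam notebook can be exercised without Tkinter or OpenCV by sending a single JPEG. The sketch below reuses the same `genai`/`glm` calls as the notebook; `frame.jpg` is a placeholder image path, and the API key is assumed to come from the environment.

```python
# Minimal sketch: describe one JPEG with the same genai/glm calls the notebook uses.
# "frame.jpg" is a placeholder path; GOOGLE_API_KEY is read from the environment.
import os

import google.ai.generativelanguage as glm
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel("gemini-pro-vision")

with open("frame.jpg", "rb") as f:
    blob = glm.Blob(mime_type="image/jpeg", data=f.read())

response = model.generate_content(["Describe this frame.", blob], stream=True)
for chunk in response:
    print(chunk.text, end="")
```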