U/kinnym/multimodalwork #2090

Draft
wants to merge 8 commits into base: 0.2
107 changes: 107 additions & 0 deletions notebook/agentchat_mic_in.ipynb
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'distutils'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspeech_recognition\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msr\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdistutils\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m#from distutils.version import LooseVersion\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_text_input\u001b[39m():\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Ask user for text input\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'distutils'"
]
}
],
"source": [
"#\n",
"# Simple Sample to Read input from a speaker and convert it to Text\n",
"#\n",
"import speech_recognition as sr\n",
"import distutils\n",
"\n",
"#\n",
"# from distutils.version import LooseVersion\n",
"#\n",
"\n",
"\n",
"def get_text_input():\n",
" # Ask user for text input\n",
" text = input(\"Enter your text: \")\n",
" return text\n",
"\n",
"\n",
"def get_voice_input():\n",
" # Initialize recognizer\n",
" recognizer = sr.Recognizer()\n",
"\n",
" with sr.Microphone() as source:\n",
" print(\"Speak now...\")\n",
" # Adjust for ambient noise\n",
" recognizer.adjust_for_ambient_noise(source)\n",
" # Listen to user's voice\n",
" audio = recognizer.listen(source)\n",
"\n",
" try:\n",
" print(\"Processing voice input...\")\n",
" # Use recognizer to convert speech to text\n",
" text = recognizer.recognize_google(audio)\n",
" return text\n",
" except sr.UnknownValueError:\n",
" print(\"Sorry, I could not understand your speech.\")\n",
" return \"\"\n",
" except sr.RequestError as e:\n",
" print(f\"Error occurred; {e}\")\n",
" return \"\"\n",
"\n",
"\n",
"def main():\n",
" # Prompt user to choose input method\n",
" print(\"Select input method:\")\n",
" print(\"1. Text input\")\n",
" print(\"2. Voice input\")\n",
" choice = input(\"Enter your choice (1/2): \")\n",
"\n",
" if choice == \"1\":\n",
" text = get_text_input()\n",
" elif choice == \"2\":\n",
" text = get_voice_input()\n",
" else:\n",
" print(\"Invalid choice. Please enter 1 or 2.\")\n",
" return\n",
"\n",
" if text:\n",
" print(\"Your input is:\", text)\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kinmap",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
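A quick way to exercise the same recognizer logic without microphone hardware is to transcribe a pre-recorded WAV file. The sketch below is an illustration only, not part of this PR: `sample.wav` is a placeholder path, and `recognize_google` still needs network access.

```python
# Minimal sketch: transcribe a pre-recorded WAV file with SpeechRecognition,
# useful for testing get_voice_input-style logic without a microphone.
# "sample.wav" is a placeholder path, not a file shipped with this PR.
import speech_recognition as sr

recognizer = sr.Recognizer()
with sr.AudioFile("sample.wav") as source:
    audio = recognizer.record(source)  # read the whole file into an AudioData object

try:
    print(recognizer.recognize_google(audio))
except sr.UnknownValueError:
    print("Sorry, I could not understand the audio.")
except sr.RequestError as e:
    print(f"Error occurred: {e}")
```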
168 changes: 168 additions & 0 deletions notebook/agentchat_webcam_in.ipynb
@@ -0,0 +1,168 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e6243c38-a8c4-4d6d-b65a-ea6d48cdbc87",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"K:\\Repos\\kinmap\\kinmap\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"#\n",
"# 1. A simple Python application that will opens the Web Camera and displays frames.\n",
"# 2. The frames that are read from the camera are then sent to Google GenAI.\n",
"# 3. The only change that is needed is to plug in your Google API key and then run.\n",
"#\n",
"\n",
"# import cv2\n",
"import threading\n",
"from PIL import Image, ImageTk\n",
"import tkinter as tk\n",
"import cv2\n",
"import io\n",
"from dotenv import load_dotenv\n",
"import os\n",
"import google.generativeai as genai\n",
"import google.ai.generativelanguage as glm\n",
"\n",
"# Plug in the API key to run the code.\n",
"GOOGLE_API_KEY = os.getenv(\"Your Google API Key\")\n",
"genai.configure(api_key=GOOGLE_API_KEY)\n",
"\n",
"load_dotenv()\n",
"\n",
"# genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))\n",
"model = genai.GenerativeModel(\"gemini-pro-vision\")\n",
"\n",
"\n",
"class ContentDescriber:\n",
" def __init__(self, root, user_input, video_handler):\n",
" self.root = root\n",
" self.user_input = user_input\n",
" self.video_handler = video_handler\n",
" self.message_var = tk.StringVar()\n",
"\n",
" def describe_content(self):\n",
" current_frame = self.video_handler.get_current_frame()\n",
" if current_frame is not None:\n",
" pil_image = Image.fromarray(cv2.cvtColor(current_frame, cv2.COLOR_BGR2RGB))\n",
" img_byte_arr = io.BytesIO()\n",
" pil_image.save(img_byte_arr, format=\"JPEG\")\n",
" blob = glm.Blob(mime_type=\"image/jpeg\", data=img_byte_arr.getvalue())\n",
" user_request = self.user_input.get()\n",
" response = model.generate_content([user_request, blob], stream=True)\n",
" for chunk in response:\n",
" self.root.after(0, self.update_message, chunk.text)\n",
" else:\n",
" self.root.after(0, self.update_message, \"No frame available\")\n",
"\n",
" def threaded_describe_content(self):\n",
" describe_thread = threading.Thread(target=self.describe_content)\n",
" describe_thread.start()\n",
"\n",
" def update_message(self, new_text):\n",
" current_text = self.message_var.get()\n",
" updated_text = current_text + new_text + \"\\n\"\n",
" self.message_var.set(updated_text)\n",
"\n",
"\n",
"class VideoStreamHandler:\n",
" def __init__(self, root, canvas):\n",
" self.root = root\n",
" self.canvas = canvas\n",
" self.cap = cv2.VideoCapture(0)\n",
" self.photo = None\n",
" self.current_frame = None\n",
"\n",
" def video_stream(self):\n",
" while self.cap.isOpened():\n",
" ret, frame = self.cap.read()\n",
" if ret:\n",
" self.current_frame = frame\n",
" cv2image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
" img = Image.fromarray(cv2image)\n",
" self.photo = ImageTk.PhotoImage(image=img)\n",
" self.canvas.create_image(0, 0, image=self.photo, anchor=tk.NW)\n",
" self.root.update()\n",
"\n",
" def start_stream(self):\n",
" thread = threading.Thread(target=self.video_stream)\n",
" thread.start()\n",
"\n",
" def stop_video(self):\n",
" if self.cap.isOpened():\n",
" self.cap.release()\n",
" self.root.destroy()\n",
"\n",
" def get_current_frame(self):\n",
" return self.current_frame\n",
"\n",
"\n",
"# Main GUI setup and button handlers\n",
"root = tk.Tk()\n",
"root.title(\"Webcam Stream\")\n",
"\n",
"user_input = tk.Entry(root, width=50)\n",
"user_input.pack()\n",
"\n",
"canvas = tk.Canvas(root, width=640, height=480)\n",
"canvas.pack()\n",
"\n",
"video_handler = VideoStreamHandler(root, canvas)\n",
"content_describer = ContentDescriber(root, user_input, video_handler)\n",
"\n",
"button = tk.Button(root, text=\"Stop\", width=50, command=video_handler.stop_video)\n",
"button.pack(anchor=tk.CENTER, expand=True)\n",
"\n",
"describe_button = tk.Button(\n",
" root, text=\"Describe the frame\", width=50, command=content_describer.threaded_describe_content\n",
")\n",
"describe_button.pack(anchor=tk.CENTER, expand=True)\n",
"\n",
"message_label = tk.Label(root, textvariable=content_describer.message_var, wraplength=500)\n",
"message_label.pack()\n",
"\n",
"video_handler.start_stream()\n",
"\n",
"root.mainloop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66c27126-b5b1-479a-b96d-f103301d8f32",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
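For reference, the Gemini call used in the webcam notebook can be exercised without Tkinter or OpenCV by sending a single JPEG. The sketch below reuses the same `genai`/`glm` calls as the notebook; `frame.jpg` is a placeholder image path, and the API key is assumed to come from the environment.

```python
# Minimal sketch: describe one JPEG with the same genai/glm calls the notebook uses.
# "frame.jpg" is a placeholder path; GOOGLE_API_KEY is read from the environment.
import os

import google.ai.generativelanguage as glm
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel("gemini-pro-vision")

with open("frame.jpg", "rb") as f:
    blob = glm.Blob(mime_type="image/jpeg", data=f.read())

response = model.generate_content(["Describe this frame.", blob], stream=True)
for chunk in response:
    print(chunk.text, end="")
```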