Skip to content

Commit

Permalink
Merge pull request #31 from keitakn/feature/issue17
Browse files Browse the repository at this point in the history
AI Assistantからのレスポンス型を整理 + 一時的にWebカメラの画像送信機能を無効化
  • Loading branch information
keitakn authored Jan 15, 2025
2 parents 178c1d6 + 28af460 commit 73aed0f
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 37 deletions.
2 changes: 2 additions & 0 deletions frontend/src/app/_components/Camera.tsx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { Alert } from '@nextui-org/react';
import React, { useEffect, useRef, useState } from 'react';
import { useVideoDeviceList } from './_hooks/useVideoDeviceList';

Expand Down Expand Up @@ -75,6 +76,7 @@ export function Camera({ onStreamChange, videoRef }: Props) {

return (
<div>
<Alert color="warning" title="ビデオ画像送信に問題が発生しています" description="現在AI Assistantはカメラの画像を認識出来ません" />
<video
ref={actualVideoRef}
autoPlay
Expand Down
66 changes: 29 additions & 37 deletions frontend/src/app/_components/InputPromptForm.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
} from '@nextui-org/react';
import Image from 'next/image';
import { type ChangeEventHandler, type FormEvent, type KeyboardEventHandler, useCallback, useEffect, useRef, useState } from 'react';
import { z } from 'zod';
import { Camera } from './Camera';
import { PromptInput } from './PromptInput';

Expand All @@ -22,27 +23,19 @@ declare global {
}
}

// Responseクラスの実装
class Response {
text: string | null;
audioData: string | null;
endOfTurn: boolean | null;
// Validation schema for messages the AI Assistant sends over the WebSocket.
// Every field is optional — a single message may carry any combination of:
//   text      : a chunk of the streamed text response
//   audio     : audio payload (assigned to audioUrl downstream — presumably a
//               data URL / base64 string; TODO confirm against the server)
//   endOfTurn : marks the end of the assistant's turn
const AssistantResponseSchema = z.object({
text: z.string().optional(),
audio: z.string().optional(),
endOfTurn: z.boolean().optional(),
});

constructor(data: any) {
this.text = null;
this.audioData = null;
this.endOfTurn = null;
type AssistantResponse = z.infer<typeof AssistantResponseSchema>;

if (data.text) {
this.text = data.text;
}
if (data.audio) {
this.audioData = data.audio;
}
if (data.endOfTurn) {
this.endOfTurn = data.endOfTurn;
}
}
/**
 * Type guard for WebSocket payloads from the AI Assistant.
 *
 * Uses `safeParse` instead of `parse`: `parse` throws a ZodError on invalid
 * input, which meant this guard could never return `false` — the caller's
 * `log.error` fallback branch was unreachable and a malformed message would
 * crash the `onmessage` handler instead of being logged.
 *
 * @param value - raw, already-JSON-parsed WebSocket message data
 * @returns true when the value matches AssistantResponseSchema
 */
function isAssistantResponse(value: unknown): value is AssistantResponse {
  return AssistantResponseSchema.safeParse(value).success;
}

type Message = {
Expand All @@ -63,7 +56,7 @@ export function InputPromptForm() {
const [messages, setMessages] = useState<Message[]>([]);
const [isRecording, setIsRecording] = useState(false);
const [stream, setStream] = useState<MediaStream | null>(null);
const currentFrameB64 = useRef<string | null>(null);
const base64CurrentFrame = useRef<string | null>(null);
const recordingAudioContextRef = useRef<AudioContext | null>(null);
const playAudioContextRef = useRef<AudioContext | null>(null);
const audioWorkletNodeRef = useRef<AudioWorkletNode | null>(null);
Expand Down Expand Up @@ -187,22 +180,25 @@ export function InputPromptForm() {

ws.onmessage = async (event) => {
try {
const messageData = JSON.parse(event.data);
log.info('受信したWebSocketメッセージ:', messageData);
const assistantResponse = JSON.parse(event.data);
log.info('受信したWebSocketメッセージ:', assistantResponse);

if (!isAssistantResponse(assistantResponse)) {
log.error('レスポンスが意図した形ではありません:', assistantResponse);
return;
}

// ユーザーからの新しい入力があった場合は再生中の音声を停止
if (messageData.text || messageData.audio) {
if (assistantResponse.text || assistantResponse.audio) {
stopCurrentAudio();
}

const response = new Response(messageData);

if (response.text != null && response.text) {
newResponseMessage += response.text;
if (assistantResponse.text != null && assistantResponse.text) {
newResponseMessage += assistantResponse.text;
setStreamingMessage(newResponseMessage);
}

if (response.endOfTurn === true) {
if (assistantResponse.endOfTurn === true) {
const lastAssistantMessage = newResponseMessage;
setMessages(prev => [...prev, {
role: 'assistant',
Expand All @@ -212,9 +208,9 @@ export function InputPromptForm() {
setStreamingMessage('');
}

if (response.audioData) {
log.info('音声データを受信:', response.audioData.substring(0, 50));
audioUrl.current = response.audioData;
if (assistantResponse.audio) {
log.info('音声データを受信:', assistantResponse.audio.substring(0, 50));
audioUrl.current = assistantResponse.audio;
log.info('音声URL設定:', audioUrl.current?.substring(0, 50));

// 音声初期化が必要な場合は初期化を行う
Expand Down Expand Up @@ -257,7 +253,7 @@ export function InputPromptForm() {
canvasRef.current.height = videoRef.current.videoHeight;
context.drawImage(videoRef.current, 0, 0, canvasRef.current.width, canvasRef.current.height);
const imageData = canvasRef.current.toDataURL('image/jpeg').split(',')[1].trim();
currentFrameB64.current = imageData;
base64CurrentFrame.current = imageData;
}
}
}, 3000);
Expand Down Expand Up @@ -298,18 +294,14 @@ export function InputPromptForm() {
String.fromCharCode(...new Uint8Array(event.data.data.int16arrayBuffer)),
);

if (webSocketRef.current && currentFrameB64.current) {
if (webSocketRef.current) {
const payload = {
realtime_input: {
media_chunks: [
{
mime_type: 'audio/pcm',
data: base64,
},
{
mime_type: 'image/jpeg',
data: currentFrameB64.current,
},
],
},
};
Expand Down

0 comments on commit 73aed0f

Please sign in to comment.