Skip to content

Commit

Permalink
Merge pull request #31 from keitakn/feature/issue17
Browse files Browse the repository at this point in the history
AI Assistantからのレスポンス型を整理 + 一時的にWebカメラの画像送信機能を無効化
  • Loading branch information
keitakn authored Jan 15, 2025
2 parents 178c1d6 + 28af460 commit 73aed0f
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 37 deletions.
2 changes: 2 additions & 0 deletions frontend/src/app/_components/Camera.tsx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { Alert } from '@nextui-org/react';
import React, { useEffect, useRef, useState } from 'react';
import { useVideoDeviceList } from './_hooks/useVideoDeviceList';

Expand Down Expand Up @@ -75,6 +76,7 @@ export function Camera({ onStreamChange, videoRef }: Props) {

return (
<div>
<Alert color="warning" title="ビデオ画像送信に問題が発生しています" description="現在AI Assistantはカメラの画像を認識出来ません" />
<video
ref={actualVideoRef}
autoPlay
Expand Down
66 changes: 29 additions & 37 deletions frontend/src/app/_components/InputPromptForm.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
} from '@nextui-org/react';
import Image from 'next/image';
import { type ChangeEventHandler, type FormEvent, type KeyboardEventHandler, useCallback, useEffect, useRef, useState } from 'react';
import { z } from 'zod';
import { Camera } from './Camera';
import { PromptInput } from './PromptInput';

Expand All @@ -22,27 +23,19 @@ declare global {
}
}

// Responseクラスの実装
class Response {
text: string | null;
audioData: string | null;
endOfTurn: boolean | null;
// Validation schema for messages the AI Assistant sends over the WebSocket.
// Every field is optional — a single message may carry any combination of:
//   text      : a chunk of the streamed text response
//   audio     : audio payload (assigned to audioUrl downstream — presumably a
//               data URL / base64 string; TODO confirm against the server)
//   endOfTurn : marks the end of the assistant's turn
const AssistantResponseSchema = z.object({
text: z.string().optional(),
audio: z.string().optional(),
endOfTurn: z.boolean().optional(),
});

constructor(data: any) {
this.text = null;
this.audioData = null;
this.endOfTurn = null;
type AssistantResponse = z.infer<typeof AssistantResponseSchema>;

if (data.text) {
this.text = data.text;
}
if (data.audio) {
this.audioData = data.audio;
}
if (data.endOfTurn) {
this.endOfTurn = data.endOfTurn;
}
}
/**
 * Type guard for WebSocket payloads from the AI Assistant.
 *
 * Uses `safeParse` instead of `parse`: `parse` throws a ZodError on invalid
 * input, which meant this guard could never return `false` — the caller's
 * `log.error` fallback branch was unreachable and a malformed message would
 * crash the `onmessage` handler instead of being logged.
 *
 * @param value - raw, already-JSON-parsed WebSocket message data
 * @returns true when the value matches AssistantResponseSchema
 */
function isAssistantResponse(value: unknown): value is AssistantResponse {
  return AssistantResponseSchema.safeParse(value).success;
}

type Message = {
Expand All @@ -63,7 +56,7 @@ export function InputPromptForm() {
const [messages, setMessages] = useState<Message[]>([]);
const [isRecording, setIsRecording] = useState(false);
const [stream, setStream] = useState<MediaStream | null>(null);
const currentFrameB64 = useRef<string | null>(null);
const base64CurrentFrame = useRef<string | null>(null);
const recordingAudioContextRef = useRef<AudioContext | null>(null);
const playAudioContextRef = useRef<AudioContext | null>(null);
const audioWorkletNodeRef = useRef<AudioWorkletNode | null>(null);
Expand Down Expand Up @@ -187,22 +180,25 @@ export function InputPromptForm() {

ws.onmessage = async (event) => {
try {
const messageData = JSON.parse(event.data);
log.info('受信したWebSocketメッセージ:', messageData);
const assistantResponse = JSON.parse(event.data);
log.info('受信したWebSocketメッセージ:', assistantResponse);

if (!isAssistantResponse(assistantResponse)) {
log.error('レスポンスが意図した形ではありません:', assistantResponse);
return;
}

// ユーザーからの新しい入力があった場合は再生中の音声を停止
if (messageData.text || messageData.audio) {
if (assistantResponse.text || assistantResponse.audio) {
stopCurrentAudio();
}

const response = new Response(messageData);

if (response.text != null && response.text) {
newResponseMessage += response.text;
if (assistantResponse.text != null && assistantResponse.text) {
newResponseMessage += assistantResponse.text;
setStreamingMessage(newResponseMessage);
}

if (response.endOfTurn === true) {
if (assistantResponse.endOfTurn === true) {
const lastAssistantMessage = newResponseMessage;
setMessages(prev => [...prev, {
role: 'assistant',
Expand All @@ -212,9 +208,9 @@ export function InputPromptForm() {
setStreamingMessage('');
}

if (response.audioData) {
log.info('音声データを受信:', response.audioData.substring(0, 50));
audioUrl.current = response.audioData;
if (assistantResponse.audio) {
log.info('音声データを受信:', assistantResponse.audio.substring(0, 50));
audioUrl.current = assistantResponse.audio;
log.info('音声URL設定:', audioUrl.current?.substring(0, 50));

// 音声初期化が必要な場合は初期化を行う
Expand Down Expand Up @@ -257,7 +253,7 @@ export function InputPromptForm() {
canvasRef.current.height = videoRef.current.videoHeight;
context.drawImage(videoRef.current, 0, 0, canvasRef.current.width, canvasRef.current.height);
const imageData = canvasRef.current.toDataURL('image/jpeg').split(',')[1].trim();
currentFrameB64.current = imageData;
base64CurrentFrame.current = imageData;
}
}
}, 3000);
Expand Down Expand Up @@ -298,18 +294,14 @@ export function InputPromptForm() {
String.fromCharCode(...new Uint8Array(event.data.data.int16arrayBuffer)),
);

if (webSocketRef.current && currentFrameB64.current) {
if (webSocketRef.current) {
const payload = {
realtime_input: {
media_chunks: [
{
mime_type: 'audio/pcm',
data: base64,
},
{
mime_type: 'image/jpeg',
data: currentFrameB64.current,
},
],
},
};
Expand Down

0 comments on commit 73aed0f

Please sign in to comment.