From 0e888d62af1434fef231e11a5c307a5b48a8deb1 Mon Sep 17 00:00:00 2001
From: Matthias Nott <mnott@mnsoft.org>
Date: Sat, 07 Mar 2026 10:49:07 +0100
Subject: [PATCH] feat: singleton audio, transcript reflection, voice persistence
---
services/wol.ts | 39 +++++-
types/index.ts | 8 +
components/chat/InputBar.tsx | 2
services/audio.ts | 104 +++++++++++++----
contexts/ConnectionContext.tsx | 5
app/chat.tsx | 10 -
components/chat/MessageBubble.tsx | 125 +++++++++++---------
contexts/ChatContext.tsx | 48 +++++++-
components/chat/VoiceButton.tsx | 8 +
9 files changed, 243 insertions(+), 106 deletions(-)
diff --git a/app/chat.tsx b/app/chat.tsx
index 49f93bc..8d4e95e 100644
--- a/app/chat.tsx
+++ b/app/chat.tsx
@@ -11,7 +11,7 @@
import { ImageCaptionModal } from "../components/chat/ImageCaptionModal";
import { StatusDot } from "../components/ui/StatusDot";
import { SessionDrawer } from "../components/SessionDrawer";
-import { playAudio, stopPlayback, isPlaying, onPlayingChange } from "../services/audio";
+import { playSingle, stopPlayback, isPlaying, onPlayingChange } from "../services/audio";
interface StagedImage {
base64: string;
@@ -32,7 +32,7 @@
const [stagedImage, setStagedImage] = useState<StagedImage | null>(null);
useEffect(() => {
- return onPlayingChange(setAudioPlaying);
+ return onPlayingChange((uri) => setAudioPlaying(uri !== null));
}, []);
const handleScreenshot = useCallback(() => {
@@ -137,10 +137,8 @@
}
for (let i = messages.length - 1; i >= 0; i--) {
const msg = messages[i];
- if (msg.role === "assistant") {
- if (msg.audioUri) {
- playAudio(msg.audioUri).catch(() => {});
- }
+ if (msg.role === "assistant" && msg.audioUri) {
+ playSingle(msg.audioUri).catch(() => {});
return;
}
}
diff --git a/components/chat/InputBar.tsx b/components/chat/InputBar.tsx
index ee2fa20..b002775 100644
--- a/components/chat/InputBar.tsx
+++ b/components/chat/InputBar.tsx
@@ -12,7 +12,7 @@
interface InputBarProps {
onSendText: (text: string) => void;
- onVoiceRecorded: (uri: string) => void;
+ onVoiceRecorded: (uri: string, durationMs?: number) => void;
onReplay: () => void;
isTextMode: boolean;
onToggleMode: () => void;
diff --git a/components/chat/MessageBubble.tsx b/components/chat/MessageBubble.tsx
index 8d3bd9a..5edf2f8 100644
--- a/components/chat/MessageBubble.tsx
+++ b/components/chat/MessageBubble.tsx
@@ -1,7 +1,7 @@
import React, { useCallback, useEffect, useState } from "react";
import { Image, Pressable, Text, View } from "react-native";
import { Message } from "../../types";
-import { playAudio, stopPlayback, onPlayingChange } from "../../services/audio";
+import { playSingle, stopPlayback, onPlayingChange } from "../../services/audio";
import { ImageViewer } from "./ImageViewer";
import { useTheme } from "../../contexts/ThemeContext";
@@ -27,11 +27,12 @@
const [showViewer, setShowViewer] = useState(false);
const { colors, isDark } = useTheme();
+ // Track whether THIS bubble's audio is playing via the singleton URI
useEffect(() => {
- return onPlayingChange((playing) => {
- if (!playing) setIsPlaying(false);
+ return onPlayingChange((uri) => {
+ setIsPlaying(uri !== null && uri === message.audioUri);
});
- }, []);
+ }, [message.audioUri]);
const isUser = message.role === "user";
const isSystem = message.role === "system";
@@ -40,11 +41,11 @@
if (!message.audioUri) return;
if (isPlaying) {
+ // This bubble is playing — stop it
await stopPlayback();
- setIsPlaying(false);
} else {
- setIsPlaying(true);
- await playAudio(message.audioUri, () => setIsPlaying(false));
+ // Play this bubble (stops anything else automatically)
+ await playSingle(message.audioUri, () => {});
}
}, [isPlaying, message.audioUri]);
@@ -114,56 +115,70 @@
/>
</View>
) : message.type === "voice" ? (
- <Pressable
- onPress={handleVoicePress}
- style={{ flexDirection: "row", alignItems: "center", gap: 12 }}
- >
- <View
- style={{
- width: 36,
- height: 36,
- borderRadius: 18,
- alignItems: "center",
- justifyContent: "center",
- backgroundColor: isPlaying
- ? "#FF9F43"
- : isUser
- ? "rgba(255,255,255,0.2)"
- : colors.border,
- }}
+ <View>
+ <Pressable
+ onPress={handleVoicePress}
+ style={{ flexDirection: "row", alignItems: "center", gap: 12 }}
>
- <Text style={{ fontSize: 14, color: isUser ? "#FFF" : colors.text }}>
- {isPlaying ? "\u23F8" : "\u25B6"}
+ <View
+ style={{
+ width: 36,
+ height: 36,
+ borderRadius: 18,
+ alignItems: "center",
+ justifyContent: "center",
+ backgroundColor: isPlaying
+ ? "#FF9F43"
+ : isUser
+ ? "rgba(255,255,255,0.2)"
+ : colors.border,
+ }}
+ >
+ <Text style={{ fontSize: 14, color: isUser ? "#FFF" : colors.text }}>
+ {isPlaying ? "\u23F8" : "\u25B6"}
+ </Text>
+ </View>
+
+ <View style={{ flex: 1, flexDirection: "row", alignItems: "center", gap: 1, height: 32 }}>
+ {Array.from({ length: 20 }).map((_, i) => (
+ <View
+ key={i}
+ style={{
+ flex: 1,
+ borderRadius: 2,
+ backgroundColor: isPlaying && i < 10
+ ? "#FF9F43"
+ : isUser
+ ? "rgba(255,255,255,0.5)"
+ : colors.textMuted,
+ height: `${20 + Math.sin(i * 0.8) * 60}%`,
+ }}
+ />
+ ))}
+ </View>
+
+ <Text
+ style={{
+ fontSize: 11,
+ color: isUser ? "rgba(255,255,255,0.8)" : colors.textSecondary,
+ }}
+ >
+ {formatDuration(message.duration)}
</Text>
- </View>
-
- <View style={{ flex: 1, flexDirection: "row", alignItems: "center", gap: 1, height: 32 }}>
- {Array.from({ length: 20 }).map((_, i) => (
- <View
- key={i}
- style={{
- flex: 1,
- borderRadius: 2,
- backgroundColor: isPlaying && i < 10
- ? "#FF9F43"
- : isUser
- ? "rgba(255,255,255,0.5)"
- : colors.textMuted,
- height: `${20 + Math.sin(i * 0.8) * 60}%`,
- }}
- />
- ))}
- </View>
-
- <Text
- style={{
- fontSize: 11,
- color: isUser ? "rgba(255,255,255,0.8)" : colors.textSecondary,
- }}
- >
- {formatDuration(message.duration)}
- </Text>
- </Pressable>
+ </Pressable>
+ {message.content ? (
+ <Text
+ style={{
+ fontSize: 14,
+ lineHeight: 20,
+ marginTop: 8,
+ color: isUser ? "rgba(255,255,255,0.9)" : colors.textSecondary,
+ }}
+ >
+ {message.content}
+ </Text>
+ ) : null}
+ </View>
) : (
<Text
style={{
diff --git a/components/chat/VoiceButton.tsx b/components/chat/VoiceButton.tsx
index 93b357a..9a8a2b8 100644
--- a/components/chat/VoiceButton.tsx
+++ b/components/chat/VoiceButton.tsx
@@ -10,7 +10,7 @@
import { stopPlayback } from "../../services/audio";
interface VoiceButtonProps {
- onVoiceRecorded: (uri: string) => void;
+ onVoiceRecorded: (uri: string, durationMs?: number) => void;
}
const VOICE_BUTTON_SIZE = 72;
@@ -98,7 +98,11 @@
});
const uri = recorder.uri;
if (uri) {
- onVoiceRecorded(uri);
+ // currentTime is in seconds after stop
+ const durationMs = recorder.currentTime > 0
+ ? Math.round(recorder.currentTime * 1000)
+ : undefined;
+ onVoiceRecorded(uri, durationMs);
}
} catch (err) {
console.error("Failed to stop recording:", err);
diff --git a/contexts/ChatContext.tsx b/contexts/ChatContext.tsx
index 0865375..151f6f7 100644
--- a/contexts/ChatContext.tsx
+++ b/contexts/ChatContext.tsx
@@ -26,7 +26,9 @@
const MESSAGES_DIR = "pailot-messages";
-/** Strip heavy fields (base64 images, audio URIs) before persisting. */
+/** Strip heavy fields (base64 images, audio URIs) before persisting.
+ * Voice messages keep their content (transcript) but lose audioUri
+ * since cache files won't survive app restarts. */
function lightMessage(m: Message): Message {
const light = { ...m };
if (light.imageBase64) light.imageBase64 = undefined;
@@ -63,7 +65,16 @@
if (!file.endsWith(".json")) continue;
const sessionId = file.replace(".json", "");
const content = await fs.readAsStringAsync(`${dir}${file}`);
- result[sessionId] = JSON.parse(content) as Message[];
+ result[sessionId] = (JSON.parse(content) as Message[])
+ // Drop voice messages with no audio and no content (empty chunks)
+ .filter((m) => !(m.type === "voice" && !m.audioUri && !m.content))
+ .map((m) => {
+ // Voice messages without audio but with transcript → show as text
+ if (m.type === "voice" && !m.audioUri && m.content) {
+ return { ...m, type: "text" };
+ }
+ return m;
+ });
}
return result;
} catch {
@@ -179,12 +190,15 @@
}
}, [messages]);
- // On connect: ask gateway to detect the focused iTerm2 session and sync
+ // On connect: ask gateway to sync sessions. If we already had a session
+ // selected, tell the gateway so it preserves our selection instead of
+ // jumping to whatever iTerm has focused on the Mac.
useEffect(() => {
if (status === "connected") {
needsSync.current = true;
- sendCommand("sync");
+ sendCommand("sync", activeSessionId ? { activeSessionId } : undefined);
}
+  // eslint-disable-next-line react-hooks/exhaustive-deps -- only fire on status change
}, [status, sendCommand]);
// Helper: add a message to the active session
@@ -233,6 +247,23 @@
},
[]
);
+
+ // Update a message's content (e.g., voice transcript reflection)
+ const updateMessageContent = useCallback((id: string, content: string) => {
+ setMessages((prev) => {
+ const next = prev.map((m) =>
+ m.id === id ? { ...m, content } : m
+ );
+ setActiveSessionId((sessId) => {
+ if (sessId) {
+ messagesMapRef.current[sessId] = next;
+ debouncedSave(messagesMapRef.current);
+ }
+ return sessId;
+ });
+ return next;
+ });
+ }, []);
// Handle incoming WebSocket messages
useEffect(() => {
@@ -322,6 +353,11 @@
sendCommand("sessions");
break;
}
+ case "transcript": {
+ // Voice → text reflection: replace voice bubble with transcribed text
+ updateMessageContent(data.messageId, data.content);
+ break;
+ }
case "error": {
const msg: Message = {
id: generateId(),
@@ -339,7 +375,7 @@
return () => {
onMessageReceived.current = null;
};
- }, [onMessageReceived, sendCommand, addMessageToActive, syncActiveFromSessions]);
+ }, [onMessageReceived, sendCommand, addMessageToActive, updateMessageContent, syncActiveFromSessions]);
const sendTextMessage = useCallback(
(text: string) => {
@@ -375,7 +411,7 @@
addMessageToActive(msg);
try {
const base64 = await encodeAudioToBase64(audioUri);
- const sent = wsVoice(base64);
+ const sent = wsVoice(base64, "", id);
updateMessageStatus(id, sent ? "sent" : "error");
} catch (err) {
console.error("Failed to encode audio:", err);
diff --git a/contexts/ConnectionContext.tsx b/contexts/ConnectionContext.tsx
index e01d867..6734d9c 100644
--- a/contexts/ConnectionContext.tsx
+++ b/contexts/ConnectionContext.tsx
@@ -24,7 +24,7 @@
connect: (config?: ServerConfig) => void;
disconnect: () => void;
sendTextMessage: (text: string) => boolean;
- sendVoiceMessage: (audioBase64: string, transcript?: string) => boolean;
+ sendVoiceMessage: (audioBase64: string, transcript?: string, messageId?: string) => boolean;
sendImageMessage: (imageBase64: string, caption: string, mimeType: string) => boolean;
sendCommand: (command: string, args?: Record<string, unknown>) => boolean;
saveServerConfig: (config: ServerConfig) => Promise<void>;
@@ -120,11 +120,12 @@
}, []);
const sendVoiceMessage = useCallback(
- (audioBase64: string, transcript: string = ""): boolean => {
+ (audioBase64: string, transcript: string = "", messageId?: string): boolean => {
return wsClient.send({
type: "voice",
content: transcript,
audioBase64,
+ messageId,
});
},
[]
diff --git a/services/audio.ts b/services/audio.ts
index 5fa8bd5..ea43236 100644
--- a/services/audio.ts
+++ b/services/audio.ts
@@ -10,20 +10,34 @@
durationMs: number;
}
+// --- Singleton audio player ---
+// Only ONE audio can play at a time. Any new play request stops the current one.
+
let currentPlayer: ReturnType<typeof createAudioPlayer> | null = null;
-const playingListeners = new Set<(playing: boolean) => void>();
+let currentUri: string | null = null;
+let cancelCurrent: (() => void) | null = null;
-// Audio queue for chaining sequential voice notes
-const audioQueue: Array<{ uri: string; onFinish?: () => void }> = [];
-let processingQueue = false;
+// Listeners get the URI of what's playing (or null when stopped)
+const playingListeners = new Set<(uri: string | null) => void>();
-function notifyListeners(playing: boolean): void {
- for (const cb of playingListeners) cb(playing);
+function notifyListeners(uri: string | null): void {
+ currentUri = uri;
+ for (const cb of playingListeners) cb(uri);
}
-export function onPlayingChange(cb: (playing: boolean) => void): () => void {
+/** Subscribe to playing state changes. Returns unsubscribe function. */
+export function onPlayingChange(cb: (uri: string | null) => void): () => void {
playingListeners.add(cb);
return () => { playingListeners.delete(cb); };
+}
+
+/** Get the URI currently playing, or null. */
+export function playingUri(): string | null {
+ return currentUri;
+}
+
+export function isPlaying(): boolean {
+ return currentPlayer !== null;
}
export async function requestPermissions(): Promise<boolean> {
@@ -44,9 +58,13 @@
return tmpPath;
}
+// --- Audio queue for chaining sequential voice notes (autoplay) ---
+const audioQueue: Array<{ uri: string; onFinish?: () => void }> = [];
+let processingQueue = false;
+
/**
- * Queue audio for playback. Multiple calls chain sequentially —
- * the next voice note plays only after the current one finishes.
+ * Play audio. Stops any current playback first (singleton).
+ * Multiple calls chain sequentially via queue (for chunked voice notes).
*/
export async function playAudio(
uri: string,
@@ -56,6 +74,18 @@
if (!processingQueue) {
processAudioQueue();
}
+}
+
+/**
+ * Play a single audio file, stopping any current playback first.
+ * Does NOT queue — immediately replaces whatever is playing.
+ */
+export async function playSingle(
+ uri: string,
+ onFinish?: () => void
+): Promise<void> {
+ await stopPlayback();
+ await playOneAudio(uri, onFinish);
}
async function processAudioQueue(): Promise<void> {
@@ -72,35 +102,57 @@
function playOneAudio(uri: string, onFinish?: () => void): Promise<void> {
return new Promise<void>(async (resolve) => {
+ let settled = false;
+ const finish = () => {
+ if (settled) return;
+ settled = true;
+ cancelCurrent = null;
+ clearTimeout(timer);
+ onFinish?.();
+ try { player?.pause(); } catch { /* ignore */ }
+ try { player?.remove(); } catch { /* ignore */ }
+ if (currentPlayer === player) {
+ currentPlayer = null;
+ notifyListeners(null);
+ }
+ resolve();
+ };
+
+ // Stop any currently playing audio first
+ if (cancelCurrent) {
+ cancelCurrent();
+ }
+
+ // Register cancel callback so stopPlayback can abort us
+ cancelCurrent = finish;
+
+ // Safety timeout
+ const timer = setTimeout(finish, 5 * 60 * 1000);
+ let player: ReturnType<typeof createAudioPlayer> | null = null;
+
try {
await setAudioModeAsync({ playsInSilentMode: true });
- const player = createAudioPlayer(uri);
+ player = createAudioPlayer(uri);
currentPlayer = player;
- notifyListeners(true);
+ notifyListeners(uri);
player.addListener("playbackStatusUpdate", (status) => {
- if (!status.playing && status.currentTime >= status.duration && status.duration > 0) {
- onFinish?.();
- player.remove();
- if (currentPlayer === player) {
- currentPlayer = null;
- if (audioQueue.length === 0) notifyListeners(false);
- }
- resolve();
+ if (!status.playing && status.currentTime > 0 &&
+ (status.duration <= 0 || status.currentTime >= status.duration)) {
+ finish();
}
});
player.play();
} catch (error) {
console.error("Failed to play audio:", error);
+ settled = true;
+ cancelCurrent = null;
+ clearTimeout(timer);
resolve();
}
});
-}
-
-export function isPlaying(): boolean {
- return currentPlayer !== null;
}
/**
@@ -108,7 +160,9 @@
*/
export async function stopPlayback(): Promise<void> {
audioQueue.length = 0;
- if (currentPlayer) {
+ if (cancelCurrent) {
+ cancelCurrent();
+ } else if (currentPlayer) {
try {
currentPlayer.pause();
currentPlayer.remove();
@@ -116,7 +170,7 @@
// Ignore cleanup errors
}
currentPlayer = null;
- notifyListeners(false);
+ notifyListeners(null);
}
}
diff --git a/services/wol.ts b/services/wol.ts
index 9929a56..d3b6a4d 100644
--- a/services/wol.ts
+++ b/services/wol.ts
@@ -56,12 +56,31 @@
}
}
+ const TIMEOUT_MS = 5000;
+
return new Promise<void>((resolve, reject) => {
+ let settled = false;
+ const settle = (fn: () => void) => {
+ if (settled) return;
+ settled = true;
+ clearTimeout(timer);
+ fn();
+ };
+
+ const timer = setTimeout(() => {
+ settle(() => {
+ try { socket.close(); } catch { /* ignore */ }
+ reject(new Error("WoL timed out — magic packet may not have been sent"));
+ });
+ }, TIMEOUT_MS);
+
const socket = dgram.createSocket({ type: "udp4" });
socket.once("error", (err: Error) => {
- try { socket.close(); } catch { /* ignore */ }
- reject(err);
+ settle(() => {
+ try { socket.close(); } catch { /* ignore */ }
+ reject(err);
+ });
});
socket.bind(0, () => {
@@ -72,20 +91,22 @@
}
let pending = broadcastAddresses.length;
- let failed = false;
for (const addr of broadcastAddresses) {
socket.send(packet, 0, packet.length, 9, addr, (err?: Error) => {
- if (err && !failed) {
- failed = true;
- try { socket.close(); } catch { /* ignore */ }
- reject(err);
+ if (err) {
+ settle(() => {
+ try { socket.close(); } catch { /* ignore */ }
+ reject(err);
+ });
return;
}
pending--;
if (pending === 0) {
- try { socket.close(); } catch { /* ignore */ }
- resolve();
+ settle(() => {
+ try { socket.close(); } catch { /* ignore */ }
+ resolve();
+ });
}
});
}
diff --git a/types/index.ts b/types/index.ts
index 9c59488..57d1765 100644
--- a/types/index.ts
+++ b/types/index.ts
@@ -34,6 +34,7 @@
type: "voice";
audioBase64: string;
content: string;
+ messageId?: string;
}
export interface WsImageMessage {
@@ -95,6 +96,12 @@
name: string;
}
+export interface WsIncomingTranscript {
+ type: "transcript";
+ messageId: string;
+ content: string;
+}
+
export interface WsIncomingError {
type: "error";
message: string;
@@ -112,5 +119,6 @@
| WsIncomingSessions
| WsIncomingSessionSwitched
| WsIncomingSessionRenamed
+ | WsIncomingTranscript
| WsIncomingError
| WsIncomingStatus;
--
Gitblit v1.3.1