Matthias Nott
2026-03-07 0e888d62af1434fef231e11a5c307a5b48a8deb1
feat: singleton audio, transcript reflection, voice persistence

- audio.ts: true singleton player — only one audio at a time, with
  URI-based tracking so each bubble knows its play state
- audio.ts: playSingle() for manual playback, playAudio() for
  autoplay queue, explicit pause before remove to stop native audio
- MessageBubble: show transcript text below voice player, track
  playing state via singleton URI
- VoiceButton: pass recording duration from recorder.currentTime
- ChatContext: handle incoming 'transcript' type to update voice
  bubbles with transcribed text, send messageId with voice messages
- ChatContext: voice messages persist transcript text, empty chunks
  are filtered on reload, transcribed voices become text bubbles
- ConnectionContext: pass messageId with voice messages to gateway
- wol.ts: add 5s timeout with settled guard to prevent hanging
- types: add WsIncomingTranscript, messageId on WsVoiceMessage
9 files modified
changed files
app/chat.tsx patch | view | blame | history
components/chat/InputBar.tsx patch | view | blame | history
components/chat/MessageBubble.tsx patch | view | blame | history
components/chat/VoiceButton.tsx patch | view | blame | history
contexts/ChatContext.tsx patch | view | blame | history
contexts/ConnectionContext.tsx patch | view | blame | history
services/audio.ts patch | view | blame | history
services/wol.ts patch | view | blame | history
types/index.ts patch | view | blame | history
app/chat.tsx
....@@ -11,7 +11,7 @@
1111 import { ImageCaptionModal } from "../components/chat/ImageCaptionModal";
1212 import { StatusDot } from "../components/ui/StatusDot";
1313 import { SessionDrawer } from "../components/SessionDrawer";
14
-import { playAudio, stopPlayback, isPlaying, onPlayingChange } from "../services/audio";
14
+import { playSingle, stopPlayback, isPlaying, onPlayingChange } from "../services/audio";
1515
1616 interface StagedImage {
1717 base64: string;
....@@ -32,7 +32,7 @@
3232 const [stagedImage, setStagedImage] = useState<StagedImage | null>(null);
3333
3434 useEffect(() => {
35
- return onPlayingChange(setAudioPlaying);
35
+ return onPlayingChange((uri) => setAudioPlaying(uri !== null));
3636 }, []);
3737
3838 const handleScreenshot = useCallback(() => {
....@@ -137,10 +137,8 @@
137137 }
138138 for (let i = messages.length - 1; i >= 0; i--) {
139139 const msg = messages[i];
140
- if (msg.role === "assistant") {
141
- if (msg.audioUri) {
142
- playAudio(msg.audioUri).catch(() => {});
143
- }
140
+ if (msg.role === "assistant" && msg.audioUri) {
141
+ playSingle(msg.audioUri).catch(() => {});
144142 return;
145143 }
146144 }
components/chat/InputBar.tsx
....@@ -12,7 +12,7 @@
1212
1313 interface InputBarProps {
1414 onSendText: (text: string) => void;
15
- onVoiceRecorded: (uri: string) => void;
15
+ onVoiceRecorded: (uri: string, durationMs?: number) => void;
1616 onReplay: () => void;
1717 isTextMode: boolean;
1818 onToggleMode: () => void;
components/chat/MessageBubble.tsx
....@@ -1,7 +1,7 @@
11 import React, { useCallback, useEffect, useState } from "react";
22 import { Image, Pressable, Text, View } from "react-native";
33 import { Message } from "../../types";
4
-import { playAudio, stopPlayback, onPlayingChange } from "../../services/audio";
4
+import { playSingle, stopPlayback, onPlayingChange } from "../../services/audio";
55 import { ImageViewer } from "./ImageViewer";
66 import { useTheme } from "../../contexts/ThemeContext";
77
....@@ -27,11 +27,12 @@
2727 const [showViewer, setShowViewer] = useState(false);
2828 const { colors, isDark } = useTheme();
2929
30
+ // Track whether THIS bubble's audio is playing via the singleton URI
3031 useEffect(() => {
31
- return onPlayingChange((playing) => {
32
- if (!playing) setIsPlaying(false);
32
+ return onPlayingChange((uri) => {
33
+ setIsPlaying(uri !== null && uri === message.audioUri);
3334 });
34
- }, []);
35
+ }, [message.audioUri]);
3536
3637 const isUser = message.role === "user";
3738 const isSystem = message.role === "system";
....@@ -40,11 +41,11 @@
4041 if (!message.audioUri) return;
4142
4243 if (isPlaying) {
44
+ // This bubble is playing — stop it
4345 await stopPlayback();
44
- setIsPlaying(false);
4546 } else {
46
- setIsPlaying(true);
47
- await playAudio(message.audioUri, () => setIsPlaying(false));
47
+ // Play this bubble (stops anything else automatically)
48
+ await playSingle(message.audioUri, () => {});
4849 }
4950 }, [isPlaying, message.audioUri]);
5051
....@@ -114,56 +115,70 @@
114115 />
115116 </View>
116117 ) : message.type === "voice" ? (
117
- <Pressable
118
- onPress={handleVoicePress}
119
- style={{ flexDirection: "row", alignItems: "center", gap: 12 }}
120
- >
121
- <View
122
- style={{
123
- width: 36,
124
- height: 36,
125
- borderRadius: 18,
126
- alignItems: "center",
127
- justifyContent: "center",
128
- backgroundColor: isPlaying
129
- ? "#FF9F43"
130
- : isUser
131
- ? "rgba(255,255,255,0.2)"
132
- : colors.border,
133
- }}
118
+ <View>
119
+ <Pressable
120
+ onPress={handleVoicePress}
121
+ style={{ flexDirection: "row", alignItems: "center", gap: 12 }}
134122 >
135
- <Text style={{ fontSize: 14, color: isUser ? "#FFF" : colors.text }}>
136
- {isPlaying ? "\u23F8" : "\u25B6"}
123
+ <View
124
+ style={{
125
+ width: 36,
126
+ height: 36,
127
+ borderRadius: 18,
128
+ alignItems: "center",
129
+ justifyContent: "center",
130
+ backgroundColor: isPlaying
131
+ ? "#FF9F43"
132
+ : isUser
133
+ ? "rgba(255,255,255,0.2)"
134
+ : colors.border,
135
+ }}
136
+ >
137
+ <Text style={{ fontSize: 14, color: isUser ? "#FFF" : colors.text }}>
138
+ {isPlaying ? "\u23F8" : "\u25B6"}
139
+ </Text>
140
+ </View>
141
+
142
+ <View style={{ flex: 1, flexDirection: "row", alignItems: "center", gap: 1, height: 32 }}>
143
+ {Array.from({ length: 20 }).map((_, i) => (
144
+ <View
145
+ key={i}
146
+ style={{
147
+ flex: 1,
148
+ borderRadius: 2,
149
+ backgroundColor: isPlaying && i < 10
150
+ ? "#FF9F43"
151
+ : isUser
152
+ ? "rgba(255,255,255,0.5)"
153
+ : colors.textMuted,
154
+ height: `${20 + Math.sin(i * 0.8) * 60}%`,
155
+ }}
156
+ />
157
+ ))}
158
+ </View>
159
+
160
+ <Text
161
+ style={{
162
+ fontSize: 11,
163
+ color: isUser ? "rgba(255,255,255,0.8)" : colors.textSecondary,
164
+ }}
165
+ >
166
+ {formatDuration(message.duration)}
137167 </Text>
138
- </View>
139
-
140
- <View style={{ flex: 1, flexDirection: "row", alignItems: "center", gap: 1, height: 32 }}>
141
- {Array.from({ length: 20 }).map((_, i) => (
142
- <View
143
- key={i}
144
- style={{
145
- flex: 1,
146
- borderRadius: 2,
147
- backgroundColor: isPlaying && i < 10
148
- ? "#FF9F43"
149
- : isUser
150
- ? "rgba(255,255,255,0.5)"
151
- : colors.textMuted,
152
- height: `${20 + Math.sin(i * 0.8) * 60}%`,
153
- }}
154
- />
155
- ))}
156
- </View>
157
-
158
- <Text
159
- style={{
160
- fontSize: 11,
161
- color: isUser ? "rgba(255,255,255,0.8)" : colors.textSecondary,
162
- }}
163
- >
164
- {formatDuration(message.duration)}
165
- </Text>
166
- </Pressable>
168
+ </Pressable>
169
+ {message.content ? (
170
+ <Text
171
+ style={{
172
+ fontSize: 14,
173
+ lineHeight: 20,
174
+ marginTop: 8,
175
+ color: isUser ? "rgba(255,255,255,0.9)" : colors.textSecondary,
176
+ }}
177
+ >
178
+ {message.content}
179
+ </Text>
180
+ ) : null}
181
+ </View>
167182 ) : (
168183 <Text
169184 style={{
components/chat/VoiceButton.tsx
....@@ -10,7 +10,7 @@
1010 import { stopPlayback } from "../../services/audio";
1111
1212 interface VoiceButtonProps {
13
- onVoiceRecorded: (uri: string) => void;
13
+ onVoiceRecorded: (uri: string, durationMs?: number) => void;
1414 }
1515
1616 const VOICE_BUTTON_SIZE = 72;
....@@ -98,7 +98,11 @@
9898 });
9999 const uri = recorder.uri;
100100 if (uri) {
101
- onVoiceRecorded(uri);
101
+ // currentTime is in seconds after stop
102
+ const durationMs = recorder.currentTime > 0
103
+ ? Math.round(recorder.currentTime * 1000)
104
+ : undefined;
105
+ onVoiceRecorded(uri, durationMs);
102106 }
103107 } catch (err) {
104108 console.error("Failed to stop recording:", err);
contexts/ChatContext.tsx
....@@ -26,7 +26,9 @@
2626
2727 const MESSAGES_DIR = "pailot-messages";
2828
29
-/** Strip heavy fields (base64 images, audio URIs) before persisting. */
29
+/** Strip heavy fields (base64 images, audio URIs) before persisting.
30
+ * Voice messages keep their content (transcript) but lose audioUri
31
+ * since cache files won't survive app restarts. */
3032 function lightMessage(m: Message): Message {
3133 const light = { ...m };
3234 if (light.imageBase64) light.imageBase64 = undefined;
....@@ -63,7 +65,16 @@
6365 if (!file.endsWith(".json")) continue;
6466 const sessionId = file.replace(".json", "");
6567 const content = await fs.readAsStringAsync(`${dir}${file}`);
66
- result[sessionId] = JSON.parse(content) as Message[];
68
+ result[sessionId] = (JSON.parse(content) as Message[])
69
+ // Drop voice messages with no audio and no content (empty chunks)
70
+ .filter((m) => !(m.type === "voice" && !m.audioUri && !m.content))
71
+ .map((m) => {
72
+ // Voice messages without audio but with transcript → show as text
73
+ if (m.type === "voice" && !m.audioUri && m.content) {
74
+ return { ...m, type: "text" };
75
+ }
76
+ return m;
77
+ });
6778 }
6879 return result;
6980 } catch {
....@@ -179,12 +190,15 @@
179190 }
180191 }, [messages]);
181192
182
- // On connect: ask gateway to detect the focused iTerm2 session and sync
193
+ // On connect: ask gateway to sync sessions. If we already had a session
194
+ // selected, tell the gateway so it preserves our selection instead of
195
+ // jumping to whatever iTerm has focused on the Mac.
183196 useEffect(() => {
184197 if (status === "connected") {
185198 needsSync.current = true;
186
- sendCommand("sync");
199
+ sendCommand("sync", activeSessionId ? { activeSessionId } : undefined);
187200 }
201
+ // eslint-disable-next-line react-hooks/exhaustive-deps — only fire on status change
188202 }, [status, sendCommand]);
189203
190204 // Helper: add a message to the active session
....@@ -233,6 +247,23 @@
233247 },
234248 []
235249 );
250
+
251
+ // Update a message's content (e.g., voice transcript reflection)
252
+ const updateMessageContent = useCallback((id: string, content: string) => {
253
+ setMessages((prev) => {
254
+ const next = prev.map((m) =>
255
+ m.id === id ? { ...m, content } : m
256
+ );
257
+ setActiveSessionId((sessId) => {
258
+ if (sessId) {
259
+ messagesMapRef.current[sessId] = next;
260
+ debouncedSave(messagesMapRef.current);
261
+ }
262
+ return sessId;
263
+ });
264
+ return next;
265
+ });
266
+ }, []);
236267
237268 // Handle incoming WebSocket messages
238269 useEffect(() => {
....@@ -322,6 +353,11 @@
322353 sendCommand("sessions");
323354 break;
324355 }
356
+ case "transcript": {
357
+ // Voice → text reflection: replace voice bubble with transcribed text
358
+ updateMessageContent(data.messageId, data.content);
359
+ break;
360
+ }
325361 case "error": {
326362 const msg: Message = {
327363 id: generateId(),
....@@ -339,7 +375,7 @@
339375 return () => {
340376 onMessageReceived.current = null;
341377 };
342
- }, [onMessageReceived, sendCommand, addMessageToActive, syncActiveFromSessions]);
378
+ }, [onMessageReceived, sendCommand, addMessageToActive, updateMessageContent, syncActiveFromSessions]);
343379
344380 const sendTextMessage = useCallback(
345381 (text: string) => {
....@@ -375,7 +411,7 @@
375411 addMessageToActive(msg);
376412 try {
377413 const base64 = await encodeAudioToBase64(audioUri);
378
- const sent = wsVoice(base64);
414
+ const sent = wsVoice(base64, "", id);
379415 updateMessageStatus(id, sent ? "sent" : "error");
380416 } catch (err) {
381417 console.error("Failed to encode audio:", err);
contexts/ConnectionContext.tsx
....@@ -24,7 +24,7 @@
2424 connect: (config?: ServerConfig) => void;
2525 disconnect: () => void;
2626 sendTextMessage: (text: string) => boolean;
27
- sendVoiceMessage: (audioBase64: string, transcript?: string) => boolean;
27
+ sendVoiceMessage: (audioBase64: string, transcript?: string, messageId?: string) => boolean;
2828 sendImageMessage: (imageBase64: string, caption: string, mimeType: string) => boolean;
2929 sendCommand: (command: string, args?: Record<string, unknown>) => boolean;
3030 saveServerConfig: (config: ServerConfig) => Promise<void>;
....@@ -120,11 +120,12 @@
120120 }, []);
121121
122122 const sendVoiceMessage = useCallback(
123
- (audioBase64: string, transcript: string = ""): boolean => {
123
+ (audioBase64: string, transcript: string = "", messageId?: string): boolean => {
124124 return wsClient.send({
125125 type: "voice",
126126 content: transcript,
127127 audioBase64,
128
+ messageId,
128129 });
129130 },
130131 []
services/audio.ts
....@@ -10,20 +10,34 @@
1010 durationMs: number;
1111 }
1212
13
+// --- Singleton audio player ---
14
+// Only ONE audio can play at a time. Any new play request stops the current one.
15
+
1316 let currentPlayer: ReturnType<typeof createAudioPlayer> | null = null;
14
-const playingListeners = new Set<(playing: boolean) => void>();
17
+let currentUri: string | null = null;
18
+let cancelCurrent: (() => void) | null = null;
1519
16
-// Audio queue for chaining sequential voice notes
17
-const audioQueue: Array<{ uri: string; onFinish?: () => void }> = [];
18
-let processingQueue = false;
20
+// Listeners get the URI of what's playing (or null when stopped)
21
+const playingListeners = new Set<(uri: string | null) => void>();
1922
20
-function notifyListeners(playing: boolean): void {
21
- for (const cb of playingListeners) cb(playing);
23
+function notifyListeners(uri: string | null): void {
24
+ currentUri = uri;
25
+ for (const cb of playingListeners) cb(uri);
2226 }
2327
24
-export function onPlayingChange(cb: (playing: boolean) => void): () => void {
28
+/** Subscribe to playing state changes. Returns unsubscribe function. */
29
+export function onPlayingChange(cb: (uri: string | null) => void): () => void {
2530 playingListeners.add(cb);
2631 return () => { playingListeners.delete(cb); };
32
+}
33
+
34
+/** Get the URI currently playing, or null. */
35
+export function playingUri(): string | null {
36
+ return currentUri;
37
+}
38
+
39
+export function isPlaying(): boolean {
40
+ return currentPlayer !== null;
2741 }
2842
2943 export async function requestPermissions(): Promise<boolean> {
....@@ -44,9 +58,13 @@
4458 return tmpPath;
4559 }
4660
61
+// --- Audio queue for chaining sequential voice notes (autoplay) ---
62
+const audioQueue: Array<{ uri: string; onFinish?: () => void }> = [];
63
+let processingQueue = false;
64
+
4765 /**
48
- * Queue audio for playback. Multiple calls chain sequentially —
49
- * the next voice note plays only after the current one finishes.
66
+ * Play audio. Stops any current playback first (singleton).
67
+ * Multiple calls chain sequentially via queue (for chunked voice notes).
5068 */
5169 export async function playAudio(
5270 uri: string,
....@@ -56,6 +74,18 @@
5674 if (!processingQueue) {
5775 processAudioQueue();
5876 }
77
+}
78
+
79
+/**
80
+ * Play a single audio file, stopping any current playback first.
81
+ * Does NOT queue — immediately replaces whatever is playing.
82
+ */
83
+export async function playSingle(
84
+ uri: string,
85
+ onFinish?: () => void
86
+): Promise<void> {
87
+ await stopPlayback();
88
+ await playOneAudio(uri, onFinish);
5989 }
6090
6191 async function processAudioQueue(): Promise<void> {
....@@ -72,35 +102,57 @@
72102
73103 function playOneAudio(uri: string, onFinish?: () => void): Promise<void> {
74104 return new Promise<void>(async (resolve) => {
105
+ let settled = false;
106
+ const finish = () => {
107
+ if (settled) return;
108
+ settled = true;
109
+ cancelCurrent = null;
110
+ clearTimeout(timer);
111
+ onFinish?.();
112
+ try { player?.pause(); } catch { /* ignore */ }
113
+ try { player?.remove(); } catch { /* ignore */ }
114
+ if (currentPlayer === player) {
115
+ currentPlayer = null;
116
+ notifyListeners(null);
117
+ }
118
+ resolve();
119
+ };
120
+
121
+ // Stop any currently playing audio first
122
+ if (cancelCurrent) {
123
+ cancelCurrent();
124
+ }
125
+
126
+ // Register cancel callback so stopPlayback can abort us
127
+ cancelCurrent = finish;
128
+
129
+ // Safety timeout
130
+ const timer = setTimeout(finish, 5 * 60 * 1000);
131
+ let player: ReturnType<typeof createAudioPlayer> | null = null;
132
+
75133 try {
76134 await setAudioModeAsync({ playsInSilentMode: true });
77135
78
- const player = createAudioPlayer(uri);
136
+ player = createAudioPlayer(uri);
79137 currentPlayer = player;
80
- notifyListeners(true);
138
+ notifyListeners(uri);
81139
82140 player.addListener("playbackStatusUpdate", (status) => {
83
- if (!status.playing && status.currentTime >= status.duration && status.duration > 0) {
84
- onFinish?.();
85
- player.remove();
86
- if (currentPlayer === player) {
87
- currentPlayer = null;
88
- if (audioQueue.length === 0) notifyListeners(false);
89
- }
90
- resolve();
141
+ if (!status.playing && status.currentTime > 0 &&
142
+ (status.duration <= 0 || status.currentTime >= status.duration)) {
143
+ finish();
91144 }
92145 });
93146
94147 player.play();
95148 } catch (error) {
96149 console.error("Failed to play audio:", error);
150
+ settled = true;
151
+ cancelCurrent = null;
152
+ clearTimeout(timer);
97153 resolve();
98154 }
99155 });
100
-}
101
-
102
-export function isPlaying(): boolean {
103
- return currentPlayer !== null;
104156 }
105157
106158 /**
....@@ -108,7 +160,9 @@
108160 */
109161 export async function stopPlayback(): Promise<void> {
110162 audioQueue.length = 0;
111
- if (currentPlayer) {
163
+ if (cancelCurrent) {
164
+ cancelCurrent();
165
+ } else if (currentPlayer) {
112166 try {
113167 currentPlayer.pause();
114168 currentPlayer.remove();
....@@ -116,7 +170,7 @@
116170 // Ignore cleanup errors
117171 }
118172 currentPlayer = null;
119
- notifyListeners(false);
173
+ notifyListeners(null);
120174 }
121175 }
122176
services/wol.ts
....@@ -56,12 +56,31 @@
5656 }
5757 }
5858
59
+ const TIMEOUT_MS = 5000;
60
+
5961 return new Promise<void>((resolve, reject) => {
62
+ let settled = false;
63
+ const settle = (fn: () => void) => {
64
+ if (settled) return;
65
+ settled = true;
66
+ clearTimeout(timer);
67
+ fn();
68
+ };
69
+
70
+ const timer = setTimeout(() => {
71
+ settle(() => {
72
+ try { socket.close(); } catch { /* ignore */ }
73
+ reject(new Error("WoL timed out — magic packet may not have been sent"));
74
+ });
75
+ }, TIMEOUT_MS);
76
+
6077 const socket = dgram.createSocket({ type: "udp4" });
6178
6279 socket.once("error", (err: Error) => {
63
- try { socket.close(); } catch { /* ignore */ }
64
- reject(err);
80
+ settle(() => {
81
+ try { socket.close(); } catch { /* ignore */ }
82
+ reject(err);
83
+ });
6584 });
6685
6786 socket.bind(0, () => {
....@@ -72,20 +91,22 @@
7291 }
7392
7493 let pending = broadcastAddresses.length;
75
- let failed = false;
7694
7795 for (const addr of broadcastAddresses) {
7896 socket.send(packet, 0, packet.length, 9, addr, (err?: Error) => {
79
- if (err && !failed) {
80
- failed = true;
81
- try { socket.close(); } catch { /* ignore */ }
82
- reject(err);
97
+ if (err) {
98
+ settle(() => {
99
+ try { socket.close(); } catch { /* ignore */ }
100
+ reject(err);
101
+ });
83102 return;
84103 }
85104 pending--;
86105 if (pending === 0) {
87
- try { socket.close(); } catch { /* ignore */ }
88
- resolve();
106
+ settle(() => {
107
+ try { socket.close(); } catch { /* ignore */ }
108
+ resolve();
109
+ });
89110 }
90111 });
91112 }
types/index.ts
....@@ -34,6 +34,7 @@
3434 type: "voice";
3535 audioBase64: string;
3636 content: string;
37
+ messageId?: string;
3738 }
3839
3940 export interface WsImageMessage {
....@@ -95,6 +96,12 @@
9596 name: string;
9697 }
9798
99
+export interface WsIncomingTranscript {
100
+ type: "transcript";
101
+ messageId: string;
102
+ content: string;
103
+}
104
+
98105 export interface WsIncomingError {
99106 type: "error";
100107 message: string;
....@@ -112,5 +119,6 @@
112119 | WsIncomingSessions
113120 | WsIncomingSessionSwitched
114121 | WsIncomingSessionRenamed
122
+ | WsIncomingTranscript
115123 | WsIncomingError
116124 | WsIncomingStatus;