From 0e888d62af1434fef231e11a5c307a5b48a8deb1 Mon Sep 17 00:00:00 2001
From: Matthias Nott <mnott@mnsoft.org>
Date: Sat, 07 Mar 2026 10:49:07 +0100
Subject: [PATCH] feat: singleton audio, transcript reflection, voice persistence

---
 services/wol.ts                   |   39 +++++-
 types/index.ts                    |    8 +
 components/chat/InputBar.tsx      |    2 
 services/audio.ts                 |  104 +++++++++++++----
 contexts/ConnectionContext.tsx    |    5 
 app/chat.tsx                      |   10 -
 components/chat/MessageBubble.tsx |  125 +++++++++++---------
 contexts/ChatContext.tsx          |   48 +++++++-
 components/chat/VoiceButton.tsx   |    8 +
 9 files changed, 243 insertions(+), 106 deletions(-)

diff --git a/app/chat.tsx b/app/chat.tsx
index 49f93bc..8d4e95e 100644
--- a/app/chat.tsx
+++ b/app/chat.tsx
@@ -11,7 +11,7 @@
 import { ImageCaptionModal } from "../components/chat/ImageCaptionModal";
 import { StatusDot } from "../components/ui/StatusDot";
 import { SessionDrawer } from "../components/SessionDrawer";
-import { playAudio, stopPlayback, isPlaying, onPlayingChange } from "../services/audio";
+import { playSingle, stopPlayback, isPlaying, onPlayingChange } from "../services/audio";
 
 interface StagedImage {
   base64: string;
@@ -32,7 +32,7 @@
   const [stagedImage, setStagedImage] = useState<StagedImage | null>(null);
 
   useEffect(() => {
-    return onPlayingChange(setAudioPlaying);
+    return onPlayingChange((uri) => setAudioPlaying(uri !== null));
   }, []);
 
   const handleScreenshot = useCallback(() => {
@@ -137,10 +137,8 @@
     }
     for (let i = messages.length - 1; i >= 0; i--) {
       const msg = messages[i];
-      if (msg.role === "assistant") {
-        if (msg.audioUri) {
-          playAudio(msg.audioUri).catch(() => {});
-        }
+      if (msg.role === "assistant" && msg.audioUri) {
+        playSingle(msg.audioUri).catch(() => {});
         return;
       }
     }
diff --git a/components/chat/InputBar.tsx b/components/chat/InputBar.tsx
index ee2fa20..b002775 100644
--- a/components/chat/InputBar.tsx
+++ b/components/chat/InputBar.tsx
@@ -12,7 +12,7 @@
 
 interface InputBarProps {
   onSendText: (text: string) => void;
-  onVoiceRecorded: (uri: string) => void;
+  onVoiceRecorded: (uri: string, durationMs?: number) => void;
   onReplay: () => void;
   isTextMode: boolean;
   onToggleMode: () => void;
diff --git a/components/chat/MessageBubble.tsx b/components/chat/MessageBubble.tsx
index 8d3bd9a..5edf2f8 100644
--- a/components/chat/MessageBubble.tsx
+++ b/components/chat/MessageBubble.tsx
@@ -1,7 +1,7 @@
 import React, { useCallback, useEffect, useState } from "react";
 import { Image, Pressable, Text, View } from "react-native";
 import { Message } from "../../types";
-import { playAudio, stopPlayback, onPlayingChange } from "../../services/audio";
+import { playSingle, stopPlayback, onPlayingChange } from "../../services/audio";
 import { ImageViewer } from "./ImageViewer";
 import { useTheme } from "../../contexts/ThemeContext";
 
@@ -27,11 +27,12 @@
   const [showViewer, setShowViewer] = useState(false);
   const { colors, isDark } = useTheme();
 
+  // Track whether THIS bubble's audio is playing via the singleton URI
   useEffect(() => {
-    return onPlayingChange((playing) => {
-      if (!playing) setIsPlaying(false);
+    return onPlayingChange((uri) => {
+      setIsPlaying(uri !== null && uri === message.audioUri);
     });
-  }, []);
+  }, [message.audioUri]);
 
   const isUser = message.role === "user";
   const isSystem = message.role === "system";
@@ -40,11 +41,11 @@
     if (!message.audioUri) return;
 
     if (isPlaying) {
+      // This bubble is playing — stop it
       await stopPlayback();
-      setIsPlaying(false);
     } else {
-      setIsPlaying(true);
-      await playAudio(message.audioUri, () => setIsPlaying(false));
+      // Play this bubble (stops anything else automatically)
+      await playSingle(message.audioUri, () => {});
     }
   }, [isPlaying, message.audioUri]);
 
@@ -114,56 +115,70 @@
             />
           </View>
         ) : message.type === "voice" ? (
-          <Pressable
-            onPress={handleVoicePress}
-            style={{ flexDirection: "row", alignItems: "center", gap: 12 }}
-          >
-            <View
-              style={{
-                width: 36,
-                height: 36,
-                borderRadius: 18,
-                alignItems: "center",
-                justifyContent: "center",
-                backgroundColor: isPlaying
-                  ? "#FF9F43"
-                  : isUser
-                  ? "rgba(255,255,255,0.2)"
-                  : colors.border,
-              }}
+          <View>
+            <Pressable
+              onPress={handleVoicePress}
+              style={{ flexDirection: "row", alignItems: "center", gap: 12 }}
             >
-              <Text style={{ fontSize: 14, color: isUser ? "#FFF" : colors.text }}>
-                {isPlaying ? "\u23F8" : "\u25B6"}
+              <View
+                style={{
+                  width: 36,
+                  height: 36,
+                  borderRadius: 18,
+                  alignItems: "center",
+                  justifyContent: "center",
+                  backgroundColor: isPlaying
+                    ? "#FF9F43"
+                    : isUser
+                    ? "rgba(255,255,255,0.2)"
+                    : colors.border,
+                }}
+              >
+                <Text style={{ fontSize: 14, color: isUser ? "#FFF" : colors.text }}>
+                  {isPlaying ? "\u23F8" : "\u25B6"}
+                </Text>
+              </View>
+
+              <View style={{ flex: 1, flexDirection: "row", alignItems: "center", gap: 1, height: 32 }}>
+                {Array.from({ length: 20 }).map((_, i) => (
+                  <View
+                    key={i}
+                    style={{
+                      flex: 1,
+                      borderRadius: 2,
+                      backgroundColor: isPlaying && i < 10
+                        ? "#FF9F43"
+                        : isUser
+                        ? "rgba(255,255,255,0.5)"
+                        : colors.textMuted,
+                      height: `${20 + Math.sin(i * 0.8) * 60}%`,
+                    }}
+                  />
+                ))}
+              </View>
+
+              <Text
+                style={{
+                  fontSize: 11,
+                  color: isUser ? "rgba(255,255,255,0.8)" : colors.textSecondary,
+                }}
+              >
+                {formatDuration(message.duration)}
               </Text>
-            </View>
-
-            <View style={{ flex: 1, flexDirection: "row", alignItems: "center", gap: 1, height: 32 }}>
-              {Array.from({ length: 20 }).map((_, i) => (
-                <View
-                  key={i}
-                  style={{
-                    flex: 1,
-                    borderRadius: 2,
-                    backgroundColor: isPlaying && i < 10
-                      ? "#FF9F43"
-                      : isUser
-                      ? "rgba(255,255,255,0.5)"
-                      : colors.textMuted,
-                    height: `${20 + Math.sin(i * 0.8) * 60}%`,
-                  }}
-                />
-              ))}
-            </View>
-
-            <Text
-              style={{
-                fontSize: 11,
-                color: isUser ? "rgba(255,255,255,0.8)" : colors.textSecondary,
-              }}
-            >
-              {formatDuration(message.duration)}
-            </Text>
-          </Pressable>
+            </Pressable>
+            {message.content ? (
+              <Text
+                style={{
+                  fontSize: 14,
+                  lineHeight: 20,
+                  marginTop: 8,
+                  color: isUser ? "rgba(255,255,255,0.9)" : colors.textSecondary,
+                }}
+              >
+                {message.content}
+              </Text>
+            ) : null}
+          </View>
         ) : (
           <Text
             style={{
diff --git a/components/chat/VoiceButton.tsx b/components/chat/VoiceButton.tsx
index 93b357a..9a8a2b8 100644
--- a/components/chat/VoiceButton.tsx
+++ b/components/chat/VoiceButton.tsx
@@ -10,7 +10,7 @@
 import { stopPlayback } from "../../services/audio";
 
 interface VoiceButtonProps {
-  onVoiceRecorded: (uri: string) => void;
+  onVoiceRecorded: (uri: string, durationMs?: number) => void;
 }
 
 const VOICE_BUTTON_SIZE = 72;
@@ -98,7 +98,11 @@
       });
       const uri = recorder.uri;
       if (uri) {
-        onVoiceRecorded(uri);
+        // currentTime is in seconds after stop
+        const durationMs = recorder.currentTime > 0
+          ? Math.round(recorder.currentTime * 1000)
+          : undefined;
+        onVoiceRecorded(uri, durationMs);
       }
     } catch (err) {
       console.error("Failed to stop recording:", err);
diff --git a/contexts/ChatContext.tsx b/contexts/ChatContext.tsx
index 0865375..151f6f7 100644
--- a/contexts/ChatContext.tsx
+++ b/contexts/ChatContext.tsx
@@ -26,7 +26,9 @@
 
 const MESSAGES_DIR = "pailot-messages";
 
-/** Strip heavy fields (base64 images, audio URIs) before persisting. */
+/** Strip heavy fields (base64 images, audio URIs) before persisting.
+ *  Voice messages keep their content (transcript) but lose audioUri
+ *  since cache files won't survive app restarts. */
 function lightMessage(m: Message): Message {
   const light = { ...m };
   if (light.imageBase64) light.imageBase64 = undefined;
@@ -63,7 +65,16 @@
       if (!file.endsWith(".json")) continue;
       const sessionId = file.replace(".json", "");
       const content = await fs.readAsStringAsync(`${dir}${file}`);
-      result[sessionId] = JSON.parse(content) as Message[];
+      result[sessionId] = (JSON.parse(content) as Message[])
+        // Drop voice messages with no audio and no content (empty chunks)
+        .filter((m) => !(m.type === "voice" && !m.audioUri && !m.content))
+        .map((m) => {
+          // Voice messages without audio but with transcript → show as text
+          if (m.type === "voice" && !m.audioUri && m.content) {
+            return { ...m, type: "text" };
+          }
+          return m;
+        });
     }
     return result;
   } catch {
@@ -179,12 +190,15 @@
     }
   }, [messages]);
 
-  // On connect: ask gateway to detect the focused iTerm2 session and sync
+  // On connect: ask gateway to sync sessions. If we already had a session
+  // selected, tell the gateway so it preserves our selection instead of
+  // jumping to whatever iTerm has focused on the Mac.
   useEffect(() => {
     if (status === "connected") {
       needsSync.current = true;
-      sendCommand("sync");
+      sendCommand("sync", activeSessionId ? { activeSessionId } : undefined);
     }
+    // eslint-disable-next-line react-hooks/exhaustive-deps -- only fire on status change
   }, [status, sendCommand]);
 
   // Helper: add a message to the active session
@@ -233,6 +247,23 @@
     },
     []
   );
+
+  // Update a message's content (e.g., voice transcript reflection)
+  const updateMessageContent = useCallback((id: string, content: string) => {
+    setMessages((prev) => {
+      const next = prev.map((m) =>
+        m.id === id ? { ...m, content } : m
+      );
+      setActiveSessionId((sessId) => {
+        if (sessId) {
+          messagesMapRef.current[sessId] = next;
+          debouncedSave(messagesMapRef.current);
+        }
+        return sessId;
+      });
+      return next;
+    });
+  }, []);
 
   // Handle incoming WebSocket messages
   useEffect(() => {
@@ -322,6 +353,11 @@
           sendCommand("sessions");
           break;
         }
+        case "transcript": {
+          // Voice → text reflection: replace voice bubble with transcribed text
+          updateMessageContent(data.messageId, data.content);
+          break;
+        }
         case "error": {
           const msg: Message = {
             id: generateId(),
@@ -339,7 +375,7 @@
     return () => {
       onMessageReceived.current = null;
     };
-  }, [onMessageReceived, sendCommand, addMessageToActive, syncActiveFromSessions]);
+  }, [onMessageReceived, sendCommand, addMessageToActive, updateMessageContent, syncActiveFromSessions]);
 
   const sendTextMessage = useCallback(
     (text: string) => {
@@ -375,7 +411,7 @@
       addMessageToActive(msg);
       try {
         const base64 = await encodeAudioToBase64(audioUri);
-        const sent = wsVoice(base64);
+        const sent = wsVoice(base64, "", id);
         updateMessageStatus(id, sent ? "sent" : "error");
       } catch (err) {
         console.error("Failed to encode audio:", err);
diff --git a/contexts/ConnectionContext.tsx b/contexts/ConnectionContext.tsx
index e01d867..6734d9c 100644
--- a/contexts/ConnectionContext.tsx
+++ b/contexts/ConnectionContext.tsx
@@ -24,7 +24,7 @@
   connect: (config?: ServerConfig) => void;
   disconnect: () => void;
   sendTextMessage: (text: string) => boolean;
-  sendVoiceMessage: (audioBase64: string, transcript?: string) => boolean;
+  sendVoiceMessage: (audioBase64: string, transcript?: string, messageId?: string) => boolean;
   sendImageMessage: (imageBase64: string, caption: string, mimeType: string) => boolean;
   sendCommand: (command: string, args?: Record<string, unknown>) => boolean;
   saveServerConfig: (config: ServerConfig) => Promise<void>;
@@ -120,11 +120,12 @@
   }, []);
 
   const sendVoiceMessage = useCallback(
-    (audioBase64: string, transcript: string = ""): boolean => {
+    (audioBase64: string, transcript: string = "", messageId?: string): boolean => {
       return wsClient.send({
         type: "voice",
         content: transcript,
         audioBase64,
+        messageId,
       });
     },
     []
diff --git a/services/audio.ts b/services/audio.ts
index 5fa8bd5..ea43236 100644
--- a/services/audio.ts
+++ b/services/audio.ts
@@ -10,20 +10,34 @@
   durationMs: number;
 }
 
+// --- Singleton audio player ---
+// Only ONE audio can play at a time. Any new play request stops the current one.
+
 let currentPlayer: ReturnType<typeof createAudioPlayer> | null = null;
-const playingListeners = new Set<(playing: boolean) => void>();
+let currentUri: string | null = null;
+let cancelCurrent: (() => void) | null = null;
 
-// Audio queue for chaining sequential voice notes
-const audioQueue: Array<{ uri: string; onFinish?: () => void }> = [];
-let processingQueue = false;
+// Listeners get the URI of what's playing (or null when stopped)
+const playingListeners = new Set<(uri: string | null) => void>();
 
-function notifyListeners(playing: boolean): void {
-  for (const cb of playingListeners) cb(playing);
+function notifyListeners(uri: string | null): void {
+  currentUri = uri;
+  for (const cb of playingListeners) cb(uri);
 }
 
-export function onPlayingChange(cb: (playing: boolean) => void): () => void {
+/** Subscribe to playing state changes. Returns unsubscribe function. */
+export function onPlayingChange(cb: (uri: string | null) => void): () => void {
   playingListeners.add(cb);
   return () => { playingListeners.delete(cb); };
+}
+
+/** Get the URI currently playing, or null. */
+export function playingUri(): string | null {
+  return currentUri;
+}
+
+export function isPlaying(): boolean {
+  return currentPlayer !== null;
 }
 
 export async function requestPermissions(): Promise<boolean> {
@@ -44,9 +58,13 @@
   return tmpPath;
 }
 
+// --- Audio queue for chaining sequential voice notes (autoplay) ---
+const audioQueue: Array<{ uri: string; onFinish?: () => void }> = [];
+let processingQueue = false;
+
 /**
- * Queue audio for playback. Multiple calls chain sequentially —
- * the next voice note plays only after the current one finishes.
+ * Queue audio for playback. Multiple calls chain sequentially via the
+ * queue (for chunked voice notes); use playSingle to preempt instead.
  */
 export async function playAudio(
   uri: string,
@@ -56,6 +74,18 @@
   if (!processingQueue) {
     processAudioQueue();
   }
+}
+
+/**
+ * Play a single audio file, stopping any current playback first.
+ * Does NOT queue — immediately replaces whatever is playing.
+ */
+export async function playSingle(
+  uri: string,
+  onFinish?: () => void
+): Promise<void> {
+  await stopPlayback();
+  await playOneAudio(uri, onFinish);
 }
 
 async function processAudioQueue(): Promise<void> {
@@ -72,35 +102,57 @@
 
 function playOneAudio(uri: string, onFinish?: () => void): Promise<void> {
   return new Promise<void>(async (resolve) => {
+    let settled = false;
+    const finish = () => {
+      if (settled) return;
+      settled = true;
+      cancelCurrent = null;
+      clearTimeout(timer);
+      onFinish?.();
+      try { player?.pause(); } catch { /* ignore */ }
+      try { player?.remove(); } catch { /* ignore */ }
+      if (currentPlayer === player) {
+        currentPlayer = null;
+        notifyListeners(null);
+      }
+      resolve();
+    };
+
+    // Stop any currently playing audio first
+    if (cancelCurrent) {
+      cancelCurrent();
+    }
+
+    // Register cancel callback so stopPlayback can abort us
+    cancelCurrent = finish;
+
+    // Safety timeout
+    const timer = setTimeout(finish, 5 * 60 * 1000);
+    let player: ReturnType<typeof createAudioPlayer> | null = null;
+
     try {
       await setAudioModeAsync({ playsInSilentMode: true });
 
-      const player = createAudioPlayer(uri);
+      player = createAudioPlayer(uri);
       currentPlayer = player;
-      notifyListeners(true);
+      notifyListeners(uri);
 
       player.addListener("playbackStatusUpdate", (status) => {
-        if (!status.playing && status.currentTime >= status.duration && status.duration > 0) {
-          onFinish?.();
-          player.remove();
-          if (currentPlayer === player) {
-            currentPlayer = null;
-            if (audioQueue.length === 0) notifyListeners(false);
-          }
-          resolve();
+        if (!status.playing && status.currentTime > 0 &&
+            (status.duration <= 0 || status.currentTime >= status.duration)) {
+          finish();
         }
       });
 
       player.play();
     } catch (error) {
       console.error("Failed to play audio:", error);
+      settled = true;
+      cancelCurrent = null;
+      clearTimeout(timer);
       resolve();
     }
   });
-}
-
-export function isPlaying(): boolean {
-  return currentPlayer !== null;
 }
 
 /**
@@ -108,7 +160,9 @@
  */
 export async function stopPlayback(): Promise<void> {
   audioQueue.length = 0;
-  if (currentPlayer) {
+  if (cancelCurrent) {
+    cancelCurrent();
+  } else if (currentPlayer) {
     try {
       currentPlayer.pause();
       currentPlayer.remove();
@@ -116,7 +170,7 @@
       // Ignore cleanup errors
     }
     currentPlayer = null;
-    notifyListeners(false);
+    notifyListeners(null);
   }
 }
 
diff --git a/services/wol.ts b/services/wol.ts
index 9929a56..d3b6a4d 100644
--- a/services/wol.ts
+++ b/services/wol.ts
@@ -56,12 +56,31 @@
     }
   }
 
+  const TIMEOUT_MS = 5000;
+
   return new Promise<void>((resolve, reject) => {
+    let settled = false;
+    const settle = (fn: () => void) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      fn();
+    };
+
+    const timer = setTimeout(() => {
+      settle(() => {
+        try { socket.close(); } catch { /* ignore */ }
+        reject(new Error("WoL timed out — magic packet may not have been sent"));
+      });
+    }, TIMEOUT_MS);
+
     const socket = dgram.createSocket({ type: "udp4" });
 
     socket.once("error", (err: Error) => {
-      try { socket.close(); } catch { /* ignore */ }
-      reject(err);
+      settle(() => {
+        try { socket.close(); } catch { /* ignore */ }
+        reject(err);
+      });
     });
 
     socket.bind(0, () => {
@@ -72,20 +91,22 @@
       }
 
       let pending = broadcastAddresses.length;
-      let failed = false;
 
       for (const addr of broadcastAddresses) {
         socket.send(packet, 0, packet.length, 9, addr, (err?: Error) => {
-          if (err && !failed) {
-            failed = true;
-            try { socket.close(); } catch { /* ignore */ }
-            reject(err);
+          if (err) {
+            settle(() => {
+              try { socket.close(); } catch { /* ignore */ }
+              reject(err);
+            });
             return;
           }
           pending--;
           if (pending === 0) {
-            try { socket.close(); } catch { /* ignore */ }
-            resolve();
+            settle(() => {
+              try { socket.close(); } catch { /* ignore */ }
+              resolve();
+            });
           }
         });
       }
diff --git a/types/index.ts b/types/index.ts
index 9c59488..57d1765 100644
--- a/types/index.ts
+++ b/types/index.ts
@@ -34,6 +34,7 @@
   type: "voice";
   audioBase64: string;
   content: string;
+  messageId?: string;
 }
 
 export interface WsImageMessage {
@@ -95,6 +96,12 @@
   name: string;
 }
 
+export interface WsIncomingTranscript {
+  type: "transcript";
+  messageId: string;
+  content: string;
+}
+
 export interface WsIncomingError {
   type: "error";
   message: string;
@@ -112,5 +119,6 @@
   | WsIncomingSessions
   | WsIncomingSessionSwitched
   | WsIncomingSessionRenamed
+  | WsIncomingTranscript
   | WsIncomingError
   | WsIncomingStatus;

--
Gitblit v1.3.1