From 41c189a18cd27efb0eae4266286126f11ed0f335 Mon Sep 17 00:00:00 2001
From: Jake <realrobotshk@gmail.com>
Date: Fri, 24 Oct 2025 18:39:24 +0800
Subject: [PATCH] Integrate voice-to-text; speech display on the stream page still needs work

---
 main.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 9 deletions(-)
diff --git a/main.py b/main.py
index fc60fb4..e5bbdf1 100644
--- a/main.py
+++ b/main.py
@@ -5,8 +5,37 @@ from math import ceil
 from itertools import product
 from rknnlite.api import RKNNLite
 import threading
+import sounddevice as sd
+import queue
+import json
+from vosk import Model, KaldiRecognizer
 import time
 
+latest_speech = {}  # newest Vosk result; bound at import so /speech never raises NameError
+
+
+def speech_loop():
+    """Run Vosk on live input audio forever, publishing each result to ``latest_speech``."""
+    global latest_speech
+    model = Model("./vosk-model-small-en-us-0.15")
+    rec = KaldiRecognizer(model, 16000)
+    q = queue.Queue()
+
+    def callback(indata, frames, time_info, status):  # `time` would shadow the module
+        if status:
+            print(status)
+        q.put(bytes(indata))  # copy the block out of the audio callback
+
+    with sd.RawInputStream(samplerate=16000, blocksize=8000, dtype='int16',
+                           channels=1, callback=callback):
+        while True:
+            if rec.AcceptWaveform(q.get()):  # q.get() blocks until the next block
+                latest_speech = json.loads(rec.Result())
+                print(".", latest_speech)
+            else:
+                latest_speech = json.loads(rec.PartialResult())
+                print("...", latest_speech, end='\r')
+
 # --- RetinaFace Utilities ---
 def letterbox_resize(image, size, bg_color):
     target_width, target_height = size
@@ -143,14 +172,14 @@ def background_loop():
             conf = data[4]
             box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
             offset = box_center - frame_center
-            face_data.append({
-                "box": [x1, y1, x2, y2],
-                "confidence": float(conf),
-                "offset_from_center": {
-                    "x": float(offset[0]),
-                    "y": float(offset[1])
-                }
-            })
+            # face_data.append({
+            #     "box": [x1, y1, x2, y2],
+            #     "confidence": float(conf),
+            #     "offset_from_center": {
+            #         "x": float(offset[0]),
+            #         "y": float(offset[1])
+            #     }
+            # })
             cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
             cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
             for j in range(5):
@@ -171,7 +200,39 @@ app = Flask(__name__)
 
 @app.route('/')
 def index():
-    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
+    """Serve the viewer page: the frame stream plus a polled speech transcript."""
+    return '''
+    <html>
+        <head><title>RetinaFace + Speech</title></head>
+        <body>
+            <h2>Live Stream</h2>
+            <img src="/video_feed" width="640" />
+            <h3>Live Speech</h3>
+            <div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
+            <script>
+                async function pollSpeech() {
+                    try {
+                        const res = await fetch('/speech');
+                        const data = await res.json();
+                        document.getElementById('speech').innerText = data.partial || data.text || '';
+                    } catch (err) {
+                        console.error('speech poll failed', err);
+                    } finally {
+                        setTimeout(pollSpeech, 300);  // reschedule even after a failed poll
+                    }
+                }
+                pollSpeech();
+            </script>
+        </body>
+    </html>
+    '''
+
+
+@app.route('/video_feed')
+def video_feed():
+    """Serve stream_frames() as multipart/x-mixed-replace; the page's <img> loads this."""
+    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
+
 
 def stream_frames():
     while True:
@@ -183,8 +234,14 @@ def stream_frames():
 def get_faces():
     return jsonify(latest_faces)
 
+@app.route('/speech')
+def get_speech():
+    """Return the current value of the ``latest_speech`` global as a JSON response."""
+    return jsonify(latest_speech)
+
 # --- Start Background Thread ---
 threading.Thread(target=background_loop, daemon=True).start()
+threading.Thread(target=speech_loop, daemon=True).start()  # daemon: does not keep the process alive at exit
 
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)