Integrated voice-to-text; it still needs to display better on the stream

2025-10-24 18:39:24 +08:00 · 2025-10-24 18:39:24 +08:00 · 41c189a18c
parent 699ae46f06
commit 41c189a18c
1 changed files with 66 additions and 9 deletions
--- a/main.py
+++ b/main.py
@ -5,8 +5,37 @@ from math import ceil
 from itertools import product
 from rknnlite.api import RKNNLite
 import threading
 import sounddevice as sd
 import queue
 import json
 from vosk import Model, KaldiRecognizer
 import time
 def speech_loop():
     """Transcribe microphone audio forever and publish it in ``latest_speech``.

     Opens a 16 kHz, mono, 16-bit raw input stream, feeds every captured
     block to a Vosk ``KaldiRecognizer`` and stores the parsed JSON result in
     the module-level ``latest_speech``. Each result is also echoed to stdout.
     The loop never exits on its own.
     """
     global latest_speech
     # Publish an empty result before the (slow) model load so that readers of
     # ``latest_speech`` never find the global undefined.
     latest_speech = {}

     sample_rate = 16000
     model = Model("./vosk-model-small-en-us-0.15")
     rec = KaldiRecognizer(model, sample_rate)
     q = queue.Queue()

     def callback(indata, frames, time_info, status):
         # Invoked by sounddevice for every captured block; only hand the raw
         # bytes over to the recognition loop. The third parameter is named
         # ``time_info`` so it does not shadow the imported ``time`` module.
         if status:
             print(status)
         q.put(bytes(indata))

     with sd.RawInputStream(samplerate=sample_rate, blocksize=8000, dtype='int16',
                            channels=1, callback=callback):
         while True:
             data = q.get()
             if rec.AcceptWaveform(data):
                 # The recognizer reports a finished utterance.
                 result = json.loads(rec.Result())
                 latest_speech = result
                 print(".", result)
             else:
                 partial = json.loads(rec.PartialResult())
                 # Only publish partials that carry text. An empty partial
                 # would otherwise replace the finalized result on the very
                 # next audio block, so the page would show it only briefly.
                 if partial.get("partial"):
                     latest_speech = partial
                 print("...", partial, end='\r')
 # --- RetinaFace Utilities ---
 def letterbox_resize(image, size, bg_color):
    target_width, target_height = size
@ -143,14 +172,14 @@ def background_loop():
            conf = data[4]
            box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
            offset = box_center - frame_center
-            face_data.append({
+            # face_data.append({
-                "box": [x1, y1, x2, y2],
+            #     "box": [x1, y1, x2, y2],
-                "confidence": float(conf),
+            #     "confidence": float(conf),
-                "offset_from_center": {
+            #     "offset_from_center": {
-                    "x": float(offset[0]),
+            #         "x": float(offset[0]),
-                    "y": float(offset[1])
+            #         "y": float(offset[1])
-                }
+            #     }
-            })
+            # })
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
            cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
            for j in range(5):
@ -171,7 +200,29 @@ app = Flask(__name__)
@app.route('/')
def index():
    """Serve the viewer page: the live video stream plus the latest speech text.

    The page embeds the stream served at ``/video`` and polls ``/speech``
    every 300 ms, showing the ``partial`` text when present and the ``text``
    field otherwise.
    """
    return '''
    <html>
        <head><title>RetinaFace + Speech</title></head>
        <body>
            <h2>Live Stream</h2>
            <img src="/video" width="640" />
            <h3>Live Speech</h3>
            <div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
            <script>
                async function pollSpeech() {
                    try {
                        const res = await fetch('/speech');
                        const data = await res.json();
                        const text = data.partial || data.text || '';
                        document.getElementById('speech').innerText = text;
                    } catch (err) {
                        console.error('speech poll failed', err);
                    } finally {
                        // Always reschedule, so one failed request does not
                        // stop the polling for good.
                        setTimeout(pollSpeech, 300);
                    }
                }
                pollSpeech();
            </script>
        </body>
    </html>
    '''


@app.route('/video')
def video_feed():
    """Stream the output of ``stream_frames`` as a multipart/x-mixed-replace response.

    ``/`` now returns the HTML page, so the stream needs its own URL for the
    page's ``<img>`` tag to load.
    """
    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
 def stream_frames():
    while True:
@ -183,8 +234,14 @@ def stream_frames():
 def get_faces():
    return jsonify(latest_faces)
# Payload served until the speech thread publishes its first result. Without
# this default, a request arriving before then would read an undefined global.
latest_speech = {}


@app.route('/speech')
def get_speech():
    """Return the most recent speech-recognition result as JSON.

    The payload is whatever ``speech_loop`` last stored in ``latest_speech``,
    or an empty object if nothing has been recognised yet.
    """
    return jsonify(latest_speech)
 # --- Launch the worker threads, then the web server ---
 # Both loops run as daemon threads, so they do not keep the process alive
 # once the main thread exits. They are started at import time, outside the
 # __main__ guard below.
 for _worker in (background_loop, speech_loop):
     threading.Thread(target=_worker, daemon=True).start()

 if __name__ == '__main__':
     # Listen on all interfaces, port 5000.
     app.run(host='0.0.0.0', port=5000)