From 41c189a18cd27efb0eae4266286126f11ed0f335 Mon Sep 17 00:00:00 2001 From: Jake Date: Fri, 24 Oct 2025 18:39:24 +0800 Subject: [PATCH] integrated voice-to-text, need it displaying on stream better --- main.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index fc60fb4..e5bbdf1 100644 --- a/main.py +++ b/main.py @@ -5,8 +5,37 @@ from math import ceil from itertools import product from rknnlite.api import RKNNLite import threading +import sounddevice as sd +import queue +import json +from vosk import Model, KaldiRecognizer import time +def speech_loop(): + global latest_speech + model = Model("./vosk-model-small-en-us-0.15") + rec = KaldiRecognizer(model, 16000) + q = queue.Queue() + + def callback(indata, frames, time, status): + if status: + print(status) + q.put(bytes(indata)) + + with sd.RawInputStream(samplerate=16000, blocksize=8000, dtype='int16', + channels=1, callback=callback): + while True: + data = q.get() + if rec.AcceptWaveform(data): + result = json.loads(rec.Result()) + latest_speech = result + print(".", result) + else: + partial = json.loads(rec.PartialResult()) + latest_speech = partial + print("...", partial, end='\r') + + # --- RetinaFace Utilities --- def letterbox_resize(image, size, bg_color): target_width, target_height = size @@ -143,14 +172,14 @@ def background_loop(): conf = data[4] box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2]) offset = box_center - frame_center - face_data.append({ - "box": [x1, y1, x2, y2], - "confidence": float(conf), - "offset_from_center": { - "x": float(offset[0]), - "y": float(offset[1]) - } - }) + # face_data.append({ + # "box": [x1, y1, x2, y2], + # "confidence": float(conf), + # "offset_from_center": { + # "x": float(offset[0]), + # "y": float(offset[1]) + # } + # }) cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2) cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) for j in range(5): @@ -171,7 +200,29 @@ app = Flask(__name__) @app.route('/') def index(): - return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame') + return ''' + + RetinaFace + Speech + +

Live Stream

+ +

Live Speech

+
+ + + + ''' + + def stream_frames(): while True: @@ -183,8 +234,14 @@ def stream_frames(): def get_faces(): return jsonify(latest_faces) +@app.route('/speech') +def get_speech(): + return jsonify(latest_speech) + + # --- Start Background Thread --- threading.Thread(target=background_loop, daemon=True).start() +threading.Thread(target=speech_loop, daemon=True).start() if __name__ == '__main__': app.run(host='0.0.0.0', port=5000)