Integrated voice-to-text; the transcript still needs to display better on the stream.

master
Jake 2025-10-24 18:39:24 +08:00
parent 699ae46f06
commit 41c189a18c
1 changed files with 66 additions and 9 deletions

75
main.py
View File

@ -5,8 +5,37 @@ from math import ceil
from itertools import product from itertools import product
from rknnlite.api import RKNNLite from rknnlite.api import RKNNLite
import threading import threading
import sounddevice as sd
import queue
import json
from vosk import Model, KaldiRecognizer
import time import time
def speech_loop(model_path="./vosk-model-small-en-us-0.15", samplerate=16000):
    """Continuously transcribe microphone audio and publish it in ``latest_speech``.

    Loads a Vosk model, opens a mono 16-bit raw input stream through
    ``sounddevice`` and feeds every captured block to a ``KaldiRecognizer``.
    The most recent non-empty recognizer output (parsed JSON) is stored in the
    module-level ``latest_speech`` and echoed to stdout. The function never
    returns; it is meant to be run as the target of a daemon thread.

    Args:
        model_path: Directory of the Vosk model to load.
        samplerate: Capture rate in Hz, used for both the input stream and
            the recognizer.
    """
    global latest_speech
    # Publish a value before the (slow) model load so that readers of
    # ``latest_speech`` never hit a NameError during start-up.
    latest_speech = {}

    model = Model(model_path)
    rec = KaldiRecognizer(model, samplerate)
    audio_queue = queue.Queue()

    # The third parameter is named ``time_info`` rather than ``time`` so it
    # does not shadow the imported ``time`` module. sounddevice passes the
    # callback arguments positionally, so the rename is safe.
    def callback(indata, frames, time_info, status):
        if status:
            print(status)
        # Copy the buffer: hand the consumer loop its own bytes object.
        audio_queue.put(bytes(indata))

    with sd.RawInputStream(
        samplerate=samplerate,
        blocksize=samplerate // 2,  # half a second of audio per block
        dtype='int16',
        channels=1,
        callback=callback,
    ):
        while True:
            data = audio_queue.get()
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                # Silence produces results with empty text; skip those so
                # the last recognised sentence stays visible.
                if result.get("text"):
                    latest_speech = result
                print(".", result)
            else:
                partial = json.loads(rec.PartialResult())
                # An empty partial directly after a final result would
                # otherwise wipe that result within one audio block.
                if partial.get("partial"):
                    latest_speech = partial
                print("...", partial, end='\r')
# --- RetinaFace Utilities --- # --- RetinaFace Utilities ---
def letterbox_resize(image, size, bg_color): def letterbox_resize(image, size, bg_color):
target_width, target_height = size target_width, target_height = size
@ -143,14 +172,14 @@ def background_loop():
conf = data[4] conf = data[4]
box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2]) box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
offset = box_center - frame_center offset = box_center - frame_center
face_data.append({ # face_data.append({
"box": [x1, y1, x2, y2], # "box": [x1, y1, x2, y2],
"confidence": float(conf), # "confidence": float(conf),
"offset_from_center": { # "offset_from_center": {
"x": float(offset[0]), # "x": float(offset[0]),
"y": float(offset[1]) # "y": float(offset[1])
} # }
}) # })
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2) cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
for j in range(5): for j in range(5):
@ -171,7 +200,29 @@ app = Flask(__name__)
@app.route('/video_feed')
def video_feed():
    """Serve the annotated camera frames as a multipart (MJPEG-style) stream."""
    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')


@app.route('/')
def index():
    """Return the landing page: the live video stream plus the live transcript.

    The image is loaded from ``/video_feed``. Pointing it at ``/`` would make
    the page embed itself, because ``/`` returns this HTML rather than the
    frame stream. The script polls ``/speech`` every 300 ms and shows the
    partial text if present, otherwise the final text.
    """
    return '''
<html>
<head><title>RetinaFace + Speech</title></head>
<body>
<h2>Live Stream</h2>
<img src="/video_feed" width="640" />
<h3>Live Speech</h3>
<div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
<script>
async function pollSpeech() {
  try {
    const res = await fetch('/speech');
    const data = await res.json();
    const text = data.partial || data.text || '';
    document.getElementById('speech').innerText = text;
  } catch (err) {
    console.error(err);
  } finally {
    // Always reschedule so one failed request does not stop polling.
    setTimeout(pollSpeech, 300);
  }
}
pollSpeech();
</script>
</body>
</html>
'''
def stream_frames(): def stream_frames():
while True: while True:
@ -183,8 +234,14 @@ def stream_frames():
def get_faces(): def get_faces():
return jsonify(latest_faces) return jsonify(latest_faces)
@app.route('/speech')
def get_speech():
    """Return the most recent speech-recognition result as JSON.

    ``latest_speech`` is assigned by the speech thread. Until that thread has
    produced its first value the name may not exist at module level, so an
    empty object is returned instead of failing with a NameError (HTTP 500).
    """
    return jsonify(globals().get('latest_speech', {}))
# --- Start Background Thread --- # --- Start Background Thread ---
threading.Thread(target=background_loop, daemon=True).start() threading.Thread(target=background_loop, daemon=True).start()
threading.Thread(target=speech_loop, daemon=True).start()
if __name__ == '__main__': if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000) app.run(host='0.0.0.0', port=5000)