integrated voice-to-text, need it displaying on stream better

master
Jake 2025-10-24 18:39:24 +08:00
parent 699ae46f06
commit 41c189a18c
1 changed files with 66 additions and 9 deletions

75
main.py
View File

@ -5,8 +5,37 @@ from math import ceil
from itertools import product
from rknnlite.api import RKNNLite
import threading
import sounddevice as sd
import queue
import json
from vosk import Model, KaldiRecognizer
import time
def speech_loop():
    """Continuously transcribe microphone audio with Vosk and publish it.

    Opens a 16 kHz, mono, int16 raw input stream, feeds every captured block
    to a ``KaldiRecognizer`` and stores the most recent non-empty decoded
    JSON (either a final result or a partial hypothesis) in the module-level
    ``latest_speech``.

    The function never returns; it is meant to be the target of a daemon
    thread.
    """
    global latest_speech

    sample_rate = 16000
    model = Model("./vosk-model-small-en-us-0.15")
    rec = KaldiRecognizer(model, sample_rate)
    audio_blocks = queue.Queue()

    def callback(indata, frames, time_info, status):
        # The third argument is named ``time_info`` (not ``time``) so that it
        # does not shadow the imported ``time`` module inside the callback.
        if status:
            print(status)
        # Snapshot the block as bytes and hand it to the consumer loop below.
        audio_blocks.put(bytes(indata))

    # blocksize=8000 frames at 16000 Hz means one block every 0.5 s.
    with sd.RawInputStream(samplerate=sample_rate, blocksize=8000, dtype='int16',
                           channels=1, callback=callback):
        while True:
            data = audio_blocks.get()
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                # Only publish real text: an empty final result would blank
                # the transcript that clients are currently displaying.
                if result.get("text"):
                    latest_speech = result
                print(".", result)
            else:
                partial = json.loads(rec.PartialResult())
                # During silence the partial hypothesis is empty; storing it
                # would overwrite the last finished utterance within one
                # block, so keep the previous value instead.
                if partial.get("partial"):
                    latest_speech = partial
                print("...", partial, end='\r')
# --- RetinaFace Utilities ---
def letterbox_resize(image, size, bg_color):
target_width, target_height = size
@ -143,14 +172,14 @@ def background_loop():
conf = data[4]
box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
offset = box_center - frame_center
face_data.append({
"box": [x1, y1, x2, y2],
"confidence": float(conf),
"offset_from_center": {
"x": float(offset[0]),
"y": float(offset[1])
}
})
# face_data.append({
# "box": [x1, y1, x2, y2],
# "confidence": float(conf),
# "offset_from_center": {
# "x": float(offset[0]),
# "y": float(offset[1])
# }
# })
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
for j in range(5):
@ -171,7 +200,29 @@ app = Flask(__name__)
@app.route('/')
def index():
    """Serve the viewer page: the live video stream plus the live transcript.

    The page embeds the frame stream from ``/video_feed`` and polls
    ``/speech`` every 300 ms, showing the ``partial`` field if present and
    otherwise the ``text`` field of the returned JSON.
    """
    # The <img> must point at the dedicated stream route; pointing it at "/"
    # would make the page try to load itself as an image.
    # The poller reschedules itself in ``finally`` so that a single failed
    # request or unparsable response does not stop the updates for good.
    return '''
    <html>
    <head><title>RetinaFace + Speech</title></head>
    <body>
    <h2>Live Stream</h2>
    <img src="/video_feed" width="640" />
    <h3>Live Speech</h3>
    <div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
    <script>
    async function pollSpeech() {
        try {
            const res = await fetch('/speech');
            const data = await res.json();
            const text = data.partial || data.text || '';
            document.getElementById('speech').innerText = text;
        } catch (err) {
            console.error('speech poll failed', err);
        } finally {
            setTimeout(pollSpeech, 300);
        }
    }
    pollSpeech();
    </script>
    </body>
    </html>
    '''


@app.route('/video_feed')
def video_feed():
    """Stream the output of ``stream_frames()`` as a multipart response.

    Uses ``multipart/x-mixed-replace`` with the boundary ``frame`` so a
    browser ``<img>`` element keeps replacing its content with each new part.
    NOTE(review): assumes stream_frames() yields parts delimited by
    ``--frame`` -- confirm against its definition.
    """
    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
def stream_frames():
while True:
@ -183,8 +234,14 @@ def stream_frames():
def get_faces():
    """Return the current value of ``latest_faces`` as a JSON response."""
    faces_snapshot = latest_faces
    return jsonify(faces_snapshot)
@app.route('/speech')
def get_speech():
    """Return the most recent speech recognition result as JSON.

    ``latest_speech`` is only bound once the speech thread has decoded its
    first audio block, so a request arriving before that would raise
    ``NameError``. In that case an empty JSON object is returned instead.
    """
    return jsonify(globals().get("latest_speech", {}))
# --- Start Background Threads ---
# Both workers loop forever, so they run as daemon threads that do not keep
# the process alive on exit. Start order: face detection, then speech.
for _worker in (background_loop, speech_loop):
    threading.Thread(target=_worker, daemon=True).start()

# Only launch Flask's built-in server when executed as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)