integrated voice-to-text, need it displaying on stream better
parent
699ae46f06
commit
41c189a18c
75
main.py
75
main.py
|
|
@ -5,8 +5,37 @@ from math import ceil
|
||||||
from itertools import product
|
from itertools import product
|
||||||
from rknnlite.api import RKNNLite
|
from rknnlite.api import RKNNLite
|
||||||
import threading
|
import threading
|
||||||
|
import sounddevice as sd
|
||||||
|
import queue
|
||||||
|
import json
|
||||||
|
from vosk import Model, KaldiRecognizer
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
def speech_loop(model_path="./vosk-model-small-en-us-0.15", samplerate=16000, blocksize=8000):
    """Continuously transcribe microphone audio and publish it in ``latest_speech``.

    Opens a mono 16-bit raw input stream, feeds each captured block to a Vosk
    ``KaldiRecognizer`` and stores the most recent recognizer output (parsed
    from JSON into a dict) in the module-level ``latest_speech``. The function
    loops forever, so it is meant to run as the target of a daemon thread.

    Args:
        model_path: Directory of the Vosk model to load.
        samplerate: Capture rate in Hz; also passed to the recognizer.
        blocksize: Frames per captured block (8000 frames at 16 kHz is 0.5 s).
    """
    global latest_speech
    # Publish an empty result before the (slow) model load so that anything
    # reading latest_speech while the recognizer is still starting finds a
    # value instead of an undefined name.
    latest_speech = {}

    model = Model(model_path)
    rec = KaldiRecognizer(model, samplerate)
    q = queue.Queue()

    # The third parameter is named time_info rather than `time` so it does not
    # shadow the imported `time` module; sounddevice passes it positionally.
    def callback(indata, frames, time_info, status):
        if status:
            print(status)
        # Copy the buffer into bytes and hand it to the recognizer loop below,
        # keeping the audio callback itself short.
        q.put(bytes(indata))

    with sd.RawInputStream(samplerate=samplerate, blocksize=blocksize, dtype='int16',
                           channels=1, callback=callback):
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
                # The recognizer reports a completed result for this audio.
                result = json.loads(rec.Result())
                latest_speech = result
                print(".", result)
            else:
                # Otherwise expose the in-progress hypothesis.
                partial = json.loads(rec.PartialResult())
                latest_speech = partial
                # '\r' keeps successive partials on one console line.
                print("...", partial, end='\r')
|
||||||
|
|
||||||
|
|
||||||
# --- RetinaFace Utilities ---
|
# --- RetinaFace Utilities ---
|
||||||
def letterbox_resize(image, size, bg_color):
|
def letterbox_resize(image, size, bg_color):
|
||||||
target_width, target_height = size
|
target_width, target_height = size
|
||||||
|
|
@ -143,14 +172,14 @@ def background_loop():
|
||||||
conf = data[4]
|
conf = data[4]
|
||||||
box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
|
box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
|
||||||
offset = box_center - frame_center
|
offset = box_center - frame_center
|
||||||
face_data.append({
|
# face_data.append({
|
||||||
"box": [x1, y1, x2, y2],
|
# "box": [x1, y1, x2, y2],
|
||||||
"confidence": float(conf),
|
# "confidence": float(conf),
|
||||||
"offset_from_center": {
|
# "offset_from_center": {
|
||||||
"x": float(offset[0]),
|
# "x": float(offset[0]),
|
||||||
"y": float(offset[1])
|
# "y": float(offset[1])
|
||||||
}
|
# }
|
||||||
})
|
# })
|
||||||
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
|
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
|
||||||
cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
|
cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
|
||||||
for j in range(5):
|
for j in range(5):
|
||||||
|
|
@ -171,7 +200,29 @@ app = Flask(__name__)
|
||||||
|
|
||||||
@app.route('/')
def index():
    """Serve the viewer page: the live video stream plus the live speech text.

    The page embeds the MJPEG stream from ``/video_feed`` and polls ``/speech``
    every 300 ms, showing the ``partial`` field when present and otherwise the
    ``text`` field of the returned JSON.
    """
    # The <img> must point at the streaming route, not at '/', because '/'
    # returns this HTML document and cannot be rendered as an image.
    # The poller reschedules itself in `finally` so a single failed request
    # (e.g. the server restarting) does not stop the transcript permanently.
    return '''
    <html>
    <head><title>RetinaFace + Speech</title></head>
    <body>
        <h2>Live Stream</h2>
        <img src="/video_feed" width="640" />
        <h3>Live Speech</h3>
        <div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
        <script>
        async function pollSpeech() {
            try {
                const res = await fetch('/speech');
                const data = await res.json();
                const text = data.partial || data.text || '';
                document.getElementById('speech').innerText = text;
            } catch (err) {
                console.error('speech poll failed', err);
            } finally {
                setTimeout(pollSpeech, 300);
            }
        }
        pollSpeech();
        </script>
    </body>
    </html>
    '''


@app.route('/video_feed')
def video_feed():
    """Serve the frames produced by ``stream_frames()`` as a multipart stream.

    This is the response the '/' route returned before it was changed to
    serve the HTML viewer page.
    """
    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def stream_frames():
|
def stream_frames():
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -183,8 +234,14 @@ def stream_frames():
|
||||||
def get_faces():
|
def get_faces():
|
||||||
return jsonify(latest_faces)
|
return jsonify(latest_faces)
|
||||||
|
|
||||||
|
@app.route('/speech')
def get_speech():
    """Return the most recent speech-recognition output as JSON.

    Responds with an empty JSON object when ``latest_speech`` has not been
    assigned yet (for example while the recognizer is still starting), so
    early polls receive valid JSON instead of failing with a NameError.
    """
    # Look the global up defensively: it may not exist until the recognizer
    # thread has produced its first result.
    return jsonify(globals().get('latest_speech') or {})
|
||||||
|
|
||||||
|
|
||||||
# --- Start Background Threads ---
# Both workers run as daemon threads so they never keep the process alive
# once the main thread exits. They are started in the same order as before:
# the face-detection loop first, then the speech loop.
for _worker in (background_loop, speech_loop):
    threading.Thread(target=_worker, daemon=True).start()

# Run the Flask server only when executed as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue