# NOTE(review): this patch was reflowed from a copy collapsed onto one line.
# Indentation in the second and third hunks is inferred (confirm against
# main.py); the blob ids on the index line predate the speech_loop edits.
diff --git a/main.py b/main.py
index fc60fb4..e5bbdf1 100644
--- a/main.py
+++ b/main.py
@@ -5,8 +5,54 @@ from math import ceil
 from itertools import product
 from rknnlite.api import RKNNLite
 import threading
+import sounddevice as sd
+import queue
+import json
+from vosk import Model, KaldiRecognizer
 import time
 
+def speech_loop(model_path="./vosk-model-small-en-us-0.15", samplerate=16000):
+    """Feed audio input to a Vosk recognizer forever, updating ``latest_speech``.
+
+    The sounddevice stream callback queues each raw audio block; this loop
+    takes blocks off the queue and passes them to the recognizer. The
+    module-level ``latest_speech`` is rebound to the parsed JSON of every
+    final or partial result. The loop has no exit condition, so run this
+    on a dedicated thread.
+
+    Args:
+        model_path: Path handed to ``vosk.Model``.
+        samplerate: Sample rate in Hz, shared by the recognizer and the
+            input stream so the two cannot drift apart.
+    """
+    global latest_speech
+    model = Model(model_path)
+    rec = KaldiRecognizer(model, samplerate)
+    q = queue.Queue()
+
+    # The third argument is named time_info so it does not shadow the
+    # imported ``time`` module inside the callback.
+    def callback(indata, frames, time_info, status):
+        if status:
+            print(status)
+        # bytes() snapshots the block before it is queued for the loop below.
+        q.put(bytes(indata))
+
+    with sd.RawInputStream(samplerate=samplerate, blocksize=8000, dtype='int16',
+                           channels=1, callback=callback):
+        while True:
+            data = q.get()
+            if rec.AcceptWaveform(data):
+                result = json.loads(rec.Result())
+                latest_speech = result
+                print(".", result)
+            else:
+                partial = json.loads(rec.PartialResult())
+                latest_speech = partial
+                # '\r' makes the next print start at the beginning of the line.
+                print("...", partial, end='\r')
+
+
 # --- RetinaFace Utilities ---
 def letterbox_resize(image, size, bg_color):
     target_width, target_height = size
@@ -143,14 +189,14 @@ def background_loop():
             conf = data[4]
             box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
             offset = box_center - frame_center
-            face_data.append({
-                "box": [x1, y1, x2, y2],
-                "confidence": float(conf),
-                "offset_from_center": {
-                    "x": float(offset[0]),
-                    "y": float(offset[1])
-                }
-            })
+            # face_data.append({
+            #     "box": [x1, y1, x2, y2],
+            #     "confidence": float(conf),
+            #     "offset_from_center": {
+            #         "x": float(offset[0]),
+            #         "y": float(offset[1])
+            #     }
+            # })
             cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
             cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
             for j in range(5):
@@ -171,7 +217,29 @@ app = Flask(__name__)
 
 @app.route('/')
 def index():
-    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
+    return '''
+