Integrate face detection and voice-to-text into the Flask stream

2025-10-24 21:13:18 +08:00 · 2025-10-24 21:13:18 +08:00 · 4926044e71
parent 41c189a18c
commit 4926044e71
1 changed files with 69 additions and 33 deletions
--- a/main.py
+++ b/main.py
@ -11,8 +11,11 @@ import json
 from vosk import Model, KaldiRecognizer
 import time

+latest_partial = {"partial": ""}
+latest_result = {"text": ""}
+
 def speech_loop():
-    global latest_speech
+    global latest_partial, latest_result
    model = Model("./vosk-model-small-en-us-0.15")
    rec = KaldiRecognizer(model, 16000)
    q = queue.Queue()
@ -27,13 +30,14 @@ def speech_loop():
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
-                result = json.loads(rec.Result())                
-                latest_speech = result
+                result = json.loads(rec.Result())
+                latest_result = result
                print(".", result)
            else:
                partial = json.loads(rec.PartialResult())
-                latest_speech = partial
-                print("...", partial, end='\r')
+                latest_partial = partial
+                print("...", partial.get("partial", ""), end='\r')
+


 # --- RetinaFace Utilities ---
@ -172,14 +176,14 @@ def background_loop():
            conf = data[4]
            box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
            offset = box_center - frame_center
-            # face_data.append({
-            #     "box": [x1, y1, x2, y2],
-            #     "confidence": float(conf),
-            #     "offset_from_center": {
-            #         "x": float(offset[0]),
-            #         "y": float(offset[1])
-            #     }
-            # })
+            face_data.append({
+                "box": [x1, y1, x2, y2],
+                "confidence": float(conf),
+                "offset_from_center": {
+                    "x": float(offset[0]),
+                    "y": float(offset[1])
+                }
+            })
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
            cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
            for j in range(5):
@ -201,29 +205,57 @@ app = Flask(__name__)
@app.route('/')
 def index():
    return '''
-    <html>
-        <head><title>RetinaFace + Speech</title></head>
-        <body>
-            <h2>Live Stream</h2>
-            <img src="/" width="640" />
-            <h3>Live Speech</h3>
-            <div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
-            <script>
-                async function pollSpeech() {
-                    const res = await fetch('/speech');
-                    const data = await res.json();
-                    const text = data.partial || data.text || '';
-                    document.getElementById('speech').innerText = text;
-                    setTimeout(pollSpeech, 300);
-                }
-                pollSpeech();
-            </script>
-        </body>
-    </html>
+    <h2>Little Sophias Inner Thoughts</h2>
+    <img src="/video_feed" width="640" />
+    
+    <h3>Voice-To-Text</h3>
+    <div><strong>Partial:</strong> <span id="partial"></span></div>
+    <div><strong>Final:</strong> <span id="final"></span></div>
+
+    <h3>Face Detection</h3>
+    <div><strong>Count:</strong> <span id="face_count"></span></div>
+    <div><strong>Offsets:</strong>
+        <ul id="face_offsets"></ul>
+    </div>
+
+    <script>
+        async function pollSpeech() {
+            const res = await fetch('/speech');
+            const data = await res.json();
+            document.getElementById('partial').innerText = data.partial || '';
+            document.getElementById('final').innerText = data.text || '';
+        }
+
+        async function pollFaces() {
+            const res = await fetch('/faces');
+            const data = await res.json();
+            document.getElementById('face_count').innerText = data.length;
+            const list = document.getElementById('face_offsets');
+            list.innerHTML = '';
+            data.forEach(face => {
+                const offset = face.offset_from_center;
+                const li = document.createElement('li');
+                li.innerText = `x: ${offset.x.toFixed(1)}, y: ${offset.y.toFixed(1)}`;
+                list.appendChild(li);
+            });
+        }
+
+        function loop() {
+            pollSpeech();
+            pollFaces();
+            setTimeout(loop, 500);
+        }
+        loop();
+    </script>
    '''



+@app.route('/video_feed')
+def video_feed():
+    return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
+
+
 def stream_frames():
    while True:
        if latest_frame:
@ -236,7 +268,11 @@ def get_faces():

@app.route('/speech')
 def get_speech():
-    return jsonify(latest_speech)
+    return jsonify({
+        "partial": latest_partial.get("partial", ""),
+        "text": latest_result.get("text", "")
+    })
+


 # --- Start Background Thread ---