face detection and voice-to-text integrated into flask stream

master
Jake 2025-10-24 21:13:18 +08:00
parent 41c189a18c
commit 4926044e71
1 changed file with 69 additions and 33 deletions

100
main.py
View File

@@ -11,8 +11,11 @@ import json
from vosk import Model, KaldiRecognizer from vosk import Model, KaldiRecognizer
import time import time
latest_partial = {"partial": ""}
latest_result = {"text": ""}
def speech_loop(): def speech_loop():
global latest_speech global latest_partial, latest_result
model = Model("./vosk-model-small-en-us-0.15") model = Model("./vosk-model-small-en-us-0.15")
rec = KaldiRecognizer(model, 16000) rec = KaldiRecognizer(model, 16000)
q = queue.Queue() q = queue.Queue()
@@ -28,12 +31,13 @@ def speech_loop():
data = q.get() data = q.get()
if rec.AcceptWaveform(data): if rec.AcceptWaveform(data):
result = json.loads(rec.Result()) result = json.loads(rec.Result())
latest_speech = result latest_result = result
print(".", result) print(".", result)
else: else:
partial = json.loads(rec.PartialResult()) partial = json.loads(rec.PartialResult())
latest_speech = partial latest_partial = partial
print("...", partial, end='\r') print("...", partial.get("partial", ""), end='\r')
# --- RetinaFace Utilities --- # --- RetinaFace Utilities ---
@@ -172,14 +176,14 @@ def background_loop():
conf = data[4] conf = data[4]
box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2]) box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
offset = box_center - frame_center offset = box_center - frame_center
# face_data.append({ face_data.append({
# "box": [x1, y1, x2, y2], "box": [x1, y1, x2, y2],
# "confidence": float(conf), "confidence": float(conf),
# "offset_from_center": { "offset_from_center": {
# "x": float(offset[0]), "x": float(offset[0]),
# "y": float(offset[1]) "y": float(offset[1])
# } }
# }) })
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2) cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
for j in range(5): for j in range(5):
@@ -201,29 +205,57 @@ app = Flask(__name__)
@app.route('/') @app.route('/')
def index(): def index():
return ''' return '''
<html> <h2>Little Sophias Inner Thoughts</h2>
<head><title>RetinaFace + Speech</title></head> <img src="/video_feed" width="640" />
<body>
<h2>Live Stream</h2> <h3>Voice-To-Text</h3>
<img src="/" width="640" /> <div><strong>Partial:</strong> <span id="partial"></span></div>
<h3>Live Speech</h3> <div><strong>Final:</strong> <span id="final"></span></div>
<div id="speech" style="font-size:1.2em; font-family:monospace;"></div>
<script> <h3>Face Detection</h3>
async function pollSpeech() { <div><strong>Count:</strong> <span id="face_count"></span></div>
const res = await fetch('/speech'); <div><strong>Offsets:</strong>
const data = await res.json(); <ul id="face_offsets"></ul>
const text = data.partial || data.text || ''; </div>
document.getElementById('speech').innerText = text;
setTimeout(pollSpeech, 300); <script>
} async function pollSpeech() {
pollSpeech(); const res = await fetch('/speech');
</script> const data = await res.json();
</body> document.getElementById('partial').innerText = data.partial || '';
</html> document.getElementById('final').innerText = data.text || '';
}
async function pollFaces() {
const res = await fetch('/faces');
const data = await res.json();
document.getElementById('face_count').innerText = data.length;
const list = document.getElementById('face_offsets');
list.innerHTML = '';
data.forEach(face => {
const offset = face.offset_from_center;
const li = document.createElement('li');
li.innerText = `x: ${offset.x.toFixed(1)}, y: ${offset.y.toFixed(1)}`;
list.appendChild(li);
});
}
function loop() {
pollSpeech();
pollFaces();
setTimeout(loop, 500);
}
loop();
</script>
''' '''
@app.route('/video_feed')
def video_feed():
return Response(stream_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')
def stream_frames(): def stream_frames():
while True: while True:
if latest_frame: if latest_frame:
@@ -236,7 +268,11 @@ def get_faces():
@app.route('/speech') @app.route('/speech')
def get_speech(): def get_speech():
return jsonify(latest_speech) return jsonify({
"partial": latest_partial.get("partial", ""),
"text": latest_result.get("text", "")
})
# --- Start Background Thread --- # --- Start Background Thread ---