face detection and voice-to-text integrated into flask stream
parent
41c189a18c
commit
4926044e71
102
main.py
102
main.py
|
|
@ -11,8 +11,11 @@ import json
|
|||
from vosk import Model, KaldiRecognizer
|
||||
import time
|
||||
|
||||
latest_partial = {"partial": ""}
|
||||
latest_result = {"text": ""}
|
||||
|
||||
def speech_loop():
|
||||
global latest_speech
|
||||
global latest_partial, latest_result
|
||||
model = Model("./vosk-model-small-en-us-0.15")
|
||||
rec = KaldiRecognizer(model, 16000)
|
||||
q = queue.Queue()
|
||||
|
|
@ -27,13 +30,14 @@ def speech_loop():
|
|||
while True:
|
||||
data = q.get()
|
||||
if rec.AcceptWaveform(data):
|
||||
result = json.loads(rec.Result())
|
||||
latest_speech = result
|
||||
result = json.loads(rec.Result())
|
||||
latest_result = result
|
||||
print(".", result)
|
||||
else:
|
||||
partial = json.loads(rec.PartialResult())
|
||||
latest_speech = partial
|
||||
print("...", partial, end='\r')
|
||||
latest_partial = partial
|
||||
print("...", partial.get("partial", ""), end='\r')
|
||||
|
||||
|
||||
|
||||
# --- RetinaFace Utilities ---
|
||||
|
|
@ -172,14 +176,14 @@ def background_loop():
|
|||
conf = data[4]
|
||||
box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
|
||||
offset = box_center - frame_center
|
||||
# face_data.append({
|
||||
# "box": [x1, y1, x2, y2],
|
||||
# "confidence": float(conf),
|
||||
# "offset_from_center": {
|
||||
# "x": float(offset[0]),
|
||||
# "y": float(offset[1])
|
||||
# }
|
||||
# })
|
||||
face_data.append({
|
||||
"box": [x1, y1, x2, y2],
|
||||
"confidence": float(conf),
|
||||
"offset_from_center": {
|
||||
"x": float(offset[0]),
|
||||
"y": float(offset[1])
|
||||
}
|
||||
})
|
||||
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
|
||||
cv2.putText(frame, f'{conf:.4f}', (x1, y1 + 12), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))
|
||||
for j in range(5):
|
||||
|
|
@ -201,29 +205,57 @@ app = Flask(__name__)
|
|||
@app.route('/')
def index():
    """Serve the dashboard page.

    The page embeds the video stream from ``/video_feed`` and runs a small
    script that re-fetches ``/speech`` and ``/faces`` every 500 ms to refresh
    the voice-to-text and face-detection read-outs. The script expects
    ``/speech`` to return ``partial`` and ``text`` keys, and ``/faces`` to
    return a JSON array whose items carry ``offset_from_center.x`` / ``.y``.
    """
    return '''
    <h2>Little Sophias Inner Thoughts</h2>
    <img src="/video_feed" width="640" />

    <h3>Voice-To-Text</h3>
    <div><strong>Partial:</strong> <span id="partial"></span></div>
    <div><strong>Final:</strong> <span id="final"></span></div>

    <h3>Face Detection</h3>
    <div><strong>Count:</strong> <span id="face_count"></span></div>
    <div><strong>Offsets:</strong>
        <ul id="face_offsets"></ul>
    </div>

    <script>
        async function pollSpeech() {
            const res = await fetch('/speech');
            const data = await res.json();
            document.getElementById('partial').innerText = data.partial || '';
            document.getElementById('final').innerText = data.text || '';
        }

        async function pollFaces() {
            const res = await fetch('/faces');
            const data = await res.json();
            document.getElementById('face_count').innerText = data.length;
            const list = document.getElementById('face_offsets');
            list.innerHTML = '';
            data.forEach(face => {
                const offset = face.offset_from_center;
                const li = document.createElement('li');
                li.innerText = `x: ${offset.x.toFixed(1)}, y: ${offset.y.toFixed(1)}`;
                list.appendChild(li);
            });
        }

        function loop() {
            pollSpeech();
            pollFaces();
            setTimeout(loop, 500);
        }
        loop();
    </script>
    '''
|
||||
|
||||
|
||||
|
||||
@app.route('/video_feed')
def video_feed():
    """Return the output of ``stream_frames()`` as a multipart response."""
    frames = stream_frames()
    return Response(
        frames,
        mimetype='multipart/x-mixed-replace; boundary=frame',
    )
|
||||
|
||||
|
||||
def stream_frames():
|
||||
while True:
|
||||
if latest_frame:
|
||||
|
|
@ -236,7 +268,11 @@ def get_faces():
|
|||
|
||||
@app.route('/speech')
def get_speech():
    """Return the latest speech-recognition state as JSON.

    The payload has two keys: ``partial``, read from ``latest_partial``, and
    ``text``, read from ``latest_result``. Either falls back to an empty
    string when its key is absent.
    """
    # Both module-level dicts are only read here, so no ``global`` statement
    # is needed.
    return jsonify({
        "partial": latest_partial.get("partial", ""),
        "text": latest_result.get("text", ""),
    })
|
||||
|
||||
|
||||
|
||||
# --- Start Background Thread ---
|
||||
|
|
|
|||
Loading…
Reference in New Issue