// esp32_sense_cam/esp32_sense_cam.ino
//
// 675 lines
// 22 KiB
// C++

/*
* XIAO ESP32S3 Sense - Face Detection Web Server
*
* This sketch captures camera frames, runs face detection,
* and serves both the video stream and detection results via web server.
*
* Board: XIAO_ESP32S3
* Required: ESP32 board package 2.0.8+
*
* IMPORTANT: In Arduino IDE, go to Tools menu and set:
* - PSRAM: "OPI PSRAM"
*/
#include "esp_camera.h"
#include <WiFi.h>
#include "esp_http_server.h"
// Try to include face detection - available in ESP32 Arduino Core with ESP-WHO
#if __has_include("human_face_detect_msr01.hpp")
#include "human_face_detect_msr01.hpp"
#include "human_face_detect_mnp01.hpp"
#define FACE_DETECTION_AVAILABLE 1
#elif __has_include("esp_face_detect.h")
#include "esp_face_detect.h"
#define FACE_DETECTION_AVAILABLE 2
#else
#define FACE_DETECTION_AVAILABLE 0
#warning "Face detection headers not found - using motion detection fallback"
#endif
// WiFi credentials used by setup() to join the network.
// NOTE(review): these are hard-coded in the sketch; consider keeping real
// credentials out of version control.
const char* ssid = "Police Surveillance Van";
const char* password = "ourpassword";
// ===========================================
// XIAO ESP32S3 Sense Camera Pin Definitions
// ===========================================
// -1 is used for the power-down and reset pins (presumably "not connected" —
// confirm against the camera driver documentation).
#define PWDN_GPIO_NUM -1
#define RESET_GPIO_NUM -1
#define XCLK_GPIO_NUM 10
#define SIOD_GPIO_NUM 40
#define SIOC_GPIO_NUM 39
// Y2..Y9 are mapped to camera data lines d0..d7 in setup().
#define Y9_GPIO_NUM 48
#define Y8_GPIO_NUM 11
#define Y7_GPIO_NUM 12
#define Y6_GPIO_NUM 14
#define Y5_GPIO_NUM 16
#define Y4_GPIO_NUM 18
#define Y3_GPIO_NUM 17
#define Y2_GPIO_NUM 15
#define VSYNC_GPIO_NUM 38
#define HREF_GPIO_NUM 47
#define PCLK_GPIO_NUM 13
// LED pin for status (the sketch treats LOW as "on" and HIGH as "off")
#define LED_GPIO_NUM 21
// Global variables
// Server handles filled in by startCameraServer(): camera_httpd serves the UI
// and control endpoints on port 80, stream_httpd serves /stream on port 81.
httpd_handle_t stream_httpd = NULL;
httpd_handle_t camera_httpd = NULL;
// Detection settings
// detectionEnabled is flipped by the /toggle handler; detectionCount and
// lastDetectionTime are updated by the stream handler for each frame.
static bool detectionEnabled = true;
static int detectionCount = 0;
static unsigned long lastDetectionTime = 0;
// For motion/change detection fallback
// NOTE(review): prevFrame / prevFrameLen are not referenced anywhere in the
// visible portion of this file — confirm whether they are still needed.
static uint8_t* prevFrame = NULL;
static size_t prevFrameLen = 0;
// Part boundary for MJPEG stream
#define PART_BOUNDARY "123456789000000000000987654321"
// Content-Type for the multipart response, the delimiter written before each
// frame, and the per-part header template (JPEG length and face count).
static const char* _STREAM_CONTENT_TYPE = "multipart/x-mixed-replace;boundary=" PART_BOUNDARY;
static const char* _STREAM_BOUNDARY = "\r\n--" PART_BOUNDARY "\r\n";
static const char* _STREAM_PART = "Content-Type: image/jpeg\r\nContent-Length: %u\r\nX-Faces: %d\r\n\r\n";
#if FACE_DETECTION_AVAILABLE == 1
// ESP-DL based face detection
// The two detector objects are created lazily on the first call to
// detect_faces_dl() and are never deleted afterwards.
HumanFaceDetectMSR01 *s_detector = nullptr;
HumanFaceDetectMNP01 *s_detector2 = nullptr;
// Runs the two-stage detector on an RGB565 frame, draws a rectangle around
// each result and re-encodes the annotated image as JPEG.
//
// fb       camera frame; frames whose format is not PIXFORMAT_RGB565 are
//          ignored (returns 0 and leaves the output arguments untouched).
// out_buf  receives the JPEG buffer produced by fmt2jpg(); set to NULL when
//          the encode fails. Not written when the frame is skipped or the
//          temporary RGB888 buffer cannot be allocated.
//          NOTE(review): the caller presumably owns and must free() this
//          buffer — confirm against the fmt2jpg() documentation.
// out_len  receives the JPEG length; set to 0 when the encode fails.
//
// Returns the number of results reported by the second-stage detector, or 0.
static int detect_faces_dl(camera_fb_t *fb, uint8_t **out_buf, size_t *out_len) {
if (!s_detector) {
s_detector = new HumanFaceDetectMSR01(0.1F, 0.5F, 10, 0.2F);
s_detector2 = new HumanFaceDetectMNP01(0.5F, 0.3F, 5);
}
int faces = 0;
if (fb->format == PIXFORMAT_RGB565) {
// Convert to RGB888
size_t rgb_len = fb->width * fb->height * 3;
uint8_t *rgb_buf = (uint8_t*)ps_malloc(rgb_len);
if (rgb_buf) {
// Convert RGB565 to RGB888
// NOTE(review): pixels are read as native-endian uint16_t; confirm this
// matches the byte order the camera driver delivers for RGB565 frames.
uint16_t *src = (uint16_t*)fb->buf;
for (size_t i = 0; i < fb->width * fb->height; i++) {
uint16_t p = src[i];
// Expand the 5/6/5-bit channels to 8 bits by shifting left (low bits stay 0).
rgb_buf[i*3] = ((p >> 11) & 0x1F) << 3;
rgb_buf[i*3+1] = ((p >> 5) & 0x3F) << 2;
rgb_buf[i*3+2] = (p & 0x1F) << 3;
}
// Run detection
// Stage 1 produces candidates; stage 2 is only run when there is at least one.
std::list<dl::detect::result_t> &results = s_detector->infer(rgb_buf, {(int)fb->height, (int)fb->width, 3});
if (results.size() > 0) {
// 'results' is a reference, so this assignment copies the stage-2 output
// into the list returned by the stage-1 detector.
results = s_detector2->infer(rgb_buf, {(int)fb->height, (int)fb->width, 3}, results);
faces = results.size();
// Draw boxes
for (auto &r : results) {
// box[0..3] are used as x1, y1, x2, y2 and clamped to the frame bounds.
int x1 = constrain(r.box[0], 0, fb->width-1);
int y1 = constrain(r.box[1], 0, fb->height-1);
int x2 = constrain(r.box[2], 0, fb->width-1);
int y2 = constrain(r.box[3], 0, fb->height-1);
// Draw green rectangle
// Only the middle (green) byte of each pixel is forced to 255.
for (int x = x1; x <= x2; x++) {
rgb_buf[(y1 * fb->width + x) * 3 + 1] = 255;
rgb_buf[(y2 * fb->width + x) * 3 + 1] = 255;
}
for (int y = y1; y <= y2; y++) {
rgb_buf[(y * fb->width + x1) * 3 + 1] = 255;
rgb_buf[(y * fb->width + x2) * 3 + 1] = 255;
}
}
}
// Convert to JPEG
// The frame is encoded even when no face was found.
if (!fmt2jpg(rgb_buf, rgb_len, fb->width, fb->height, PIXFORMAT_RGB888, 80, out_buf, out_len)) {
*out_buf = NULL;
*out_len = 0;
}
free(rgb_buf);
}
}
return faces;
}
#endif
// Skin-tone heuristic used when the ESP-DL detectors are not available.
//
// Returns 1 when a pixel expanded from RGB565 to 8-bit channels satisfies
//   R > 95, G > 40, B > 20, R > G > B, R - G > 15 and R - B > 15
// and 0 otherwise.
static int rgb565_is_skin(uint16_t pixel) {
  const int red = (pixel >> 8) & 0xF8;    // top 5 bits, scaled to 0..248
  const int green = (pixel >> 3) & 0xFC;  // middle 6 bits, scaled to 0..252
  const int blue = (pixel << 3) & 0xF8;   // low 5 bits, scaled to 0..248

  if (red <= 95 || green <= 40 || blue <= 20) {
    return 0;
  }
  if (red <= green || green <= blue) {
    return 0;
  }
  if (red - green <= 15 || red - blue <= 15) {
    return 0;
  }
  return 1;
}

// Simple skin-tone based face detection (works without ESP-DL).
//
// rgb565_buf  frame data, read as width * height 16-bit RGB565 pixels.
//             NOTE(review): pixels are read as native-endian uint16_t; confirm
//             this matches the byte order the camera driver delivers.
// width       frame width in pixels
// height      frame height in pixels
//
// Returns 1 when the fraction of skin-tone pixels lies strictly between
// 5% and 60% of the frame, otherwise 0.
static int detect_faces_simple(uint8_t *rgb565_buf, int width, int height) {
  const uint16_t *frame = (uint16_t*)rgb565_buf;
  const int pixel_total = width * height;
  int skin_hits = 0;

  for (int idx = 0; idx < pixel_total; ++idx) {
    skin_hits += rgb565_is_skin(frame[idx]);
  }

  const float skin_fraction = (float)skin_hits / pixel_total;
  return (skin_fraction > 0.05 && skin_fraction < 0.6) ? 1 : 0;
}
// Stream handler
//
// Serves an endless multipart/x-mixed-replace MJPEG stream. For every frame:
//   1. grab a frame buffer from the camera,
//   2. optionally run face detection on RGB565 frames,
//   3. make sure a JPEG image is available (annotated, original or converted),
//   4. update detectionCount / the status LED,
//   5. send boundary, part header and JPEG data as HTTP chunks.
// The loop ends when a capture or a send fails (a failed send is how a client
// disconnect shows up here).
//
// Detection state is only refreshed inside this loop, so it is cleared when
// the loop exits; otherwise /status and the LED would keep reporting the last
// face count after the viewer has gone away.
//
// Returns ESP_OK only if setting the content type failed to fail and the loop
// never ran into an error — in practice the error that ended the stream.
static esp_err_t stream_handler(httpd_req_t *req) {
  esp_err_t res = httpd_resp_set_type(req, _STREAM_CONTENT_TYPE);
  if (res != ESP_OK) {
    return res;
  }
  httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
  httpd_resp_set_hdr(req, "X-Framerate", "15");

  char part_buf[128];
  while (true) {
    camera_fb_t *fb = esp_camera_fb_get();
    if (!fb) {
      Serial.println("Camera capture failed");
      res = ESP_FAIL;
      break;
    }

    uint8_t *jpg_buf = NULL;
    size_t jpg_len = 0;
    int faces = 0;
    if (detectionEnabled) {
#if FACE_DETECTION_AVAILABLE == 1
      if (fb->format == PIXFORMAT_RGB565) {
        faces = detect_faces_dl(fb, &jpg_buf, &jpg_len);
      }
#else
      // Fallback: simple skin-tone detection
      if (fb->format == PIXFORMAT_RGB565) {
        faces = detect_faces_simple(fb->buf, fb->width, fb->height);
      }
#endif
    }

    // Use original frame if detection didn't produce output
    if (jpg_buf == NULL) {
      if (fb->format == PIXFORMAT_JPEG) {
        // Borrow the camera buffer directly; it must not be free()d below.
        jpg_buf = fb->buf;
        jpg_len = fb->len;
      } else if (!frame2jpg(fb, 80, &jpg_buf, &jpg_len)) {
        // Conversion failed: drop this frame and try the next one.
        esp_camera_fb_return(fb);
        continue;
      }
    }

    // Update detection status. A miss only clears the count after 500 ms
    // without a hit, so the indicator does not flicker between frames.
    if (faces > 0) {
      detectionCount = faces;
      lastDetectionTime = millis();
      digitalWrite(LED_GPIO_NUM, LOW);   // LED on
    } else if (millis() - lastDetectionTime > 500) {
      detectionCount = 0;
      digitalWrite(LED_GPIO_NUM, HIGH);  // LED off
    }

    // Send frame: boundary, then part header, then the JPEG payload.
    res = httpd_resp_send_chunk(req, _STREAM_BOUNDARY, strlen(_STREAM_BOUNDARY));
    if (res == ESP_OK) {
      int hlen = snprintf(part_buf, sizeof(part_buf), _STREAM_PART,
                          (unsigned int)jpg_len, detectionCount);
      if (hlen < 0 || (size_t)hlen >= sizeof(part_buf)) {
        // Never send more bytes than were actually written into part_buf.
        res = ESP_FAIL;
      } else {
        res = httpd_resp_send_chunk(req, part_buf, (size_t)hlen);
      }
    }
    if (res == ESP_OK) {
      res = httpd_resp_send_chunk(req, (const char*)jpg_buf, jpg_len);
    }

    // Free JPEG buffer if we allocated it
    if (jpg_buf != fb->buf) {
      free(jpg_buf);
    }
    esp_camera_fb_return(fb);

    if (res != ESP_OK) {
      break;
    }
  }

  // Nothing updates the detection state once streaming stops, so reset it.
  detectionCount = 0;
  digitalWrite(LED_GPIO_NUM, HIGH);  // LED off
  return res;
}
// Single capture handler
//
// Grabs one camera frame and returns it as an inline JPEG. Frames that are
// not already JPEG are converted with frame2jpg() at quality 80.
// Responds with HTTP 500 and returns ESP_FAIL when no frame is available or
// the conversion fails; otherwise returns the result of httpd_resp_send().
static esp_err_t capture_handler(httpd_req_t *req) {
  camera_fb_t *frame = esp_camera_fb_get();
  if (frame == NULL) {
    httpd_resp_send_500(req);
    return ESP_FAIL;
  }

  httpd_resp_set_type(req, "image/jpeg");
  httpd_resp_set_hdr(req, "Content-Disposition", "inline; filename=capture.jpg");
  httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");

  esp_err_t status = ESP_FAIL;
  uint8_t *encoded = NULL;
  size_t encoded_len = 0;

  if (frame->format == PIXFORMAT_JPEG) {
    // The sensor already produced JPEG: send the frame buffer as-is.
    status = httpd_resp_send(req, (const char*)frame->buf, frame->len);
  } else if (frame2jpg(frame, 80, &encoded, &encoded_len)) {
    status = httpd_resp_send(req, (const char*)encoded, encoded_len);
    free(encoded);
  } else {
    httpd_resp_send_500(req);
  }

  // The frame buffer is handed back to the driver on every path past the
  // initial capture check.
  esp_camera_fb_return(frame);
  return status;
}
// Status handler
//
// Replies with a JSON object describing the current detection state:
//   detection  whether detection is enabled (true/false)
//   count      current value of detectionCount
//   method     "neural-network" or "skin-tone", chosen at compile time
//   heap       ESP.getFreeHeap()
//   psram      ESP.getFreePsram()
static esp_err_t status_handler(httpd_req_t *req) {
#if FACE_DETECTION_AVAILABLE == 1
  const char *method_name = "neural-network";
#else
  const char *method_name = "skin-tone";
#endif
  const char *enabled_text = "false";
  if (detectionEnabled) {
    enabled_text = "true";
  }

  char body[256];
  snprintf(body, sizeof(body),
           "{\"detection\":%s,\"count\":%d,\"method\":\"%s\",\"heap\":%lu,\"psram\":%lu}",
           enabled_text,
           detectionCount,
           method_name,
           (unsigned long)ESP.getFreeHeap(),
           (unsigned long)ESP.getFreePsram());

  httpd_resp_set_type(req, "application/json");
  httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
  return httpd_resp_send(req, body, strlen(body));
}
// Toggle detection
//
// Flips detectionEnabled and replies with the new state as JSON,
// e.g. {"detection":true}.
static esp_err_t toggle_handler(httpd_req_t *req) {
  detectionEnabled = !detectionEnabled;

  const char *state_text;
  if (detectionEnabled) {
    state_text = "true";
  } else {
    state_text = "false";
  }

  char body[64];
  snprintf(body, sizeof(body), "{\"detection\":%s}", state_text);

  httpd_resp_set_type(req, "application/json");
  httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
  return httpd_resp_send(req, body, strlen(body));
}
// Main page - HTML in static storage to avoid stack overflow
//
// Serves the single-page UI as text/html. The page is one raw string literal
// declared `static const`, so it is not placed on the handler's stack.
//
// What the embedded page does (see the <script> section of the literal):
//   - points the <img id="stream"> element at port 81, path /stream, on the
//     same host the page was loaded from;
//   - "Toggle Detection" fetches /toggle and feeds the JSON reply to updateUI();
//   - "Capture Image" opens /capture in a new tab;
//   - "Refresh Stream" reloads the stream URL with a cache-busting timestamp;
//   - /status is polled every 500 ms to refresh the indicators.
//
// Returns the result of httpd_resp_send().
static esp_err_t index_handler(httpd_req_t *req) {
static const char html[] = R"rawliteral(
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>XIAO ESP32S3 Face Detection</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
min-height: 100vh;
color: #fff;
padding: 20px;
}
.container { max-width: 900px; margin: 0 auto; }
h1 {
text-align: center;
margin-bottom: 20px;
font-size: 1.8em;
background: linear-gradient(90deg, #00d9ff, #00ff88);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.video-container {
background: #0f0f23;
border-radius: 16px;
padding: 15px;
box-shadow: 0 10px 40px rgba(0,0,0,0.4);
margin-bottom: 20px;
position: relative;
}
#stream {
width: 100%;
border-radius: 8px;
display: block;
}
.overlay {
position: absolute;
top: 25px;
right: 25px;
background: rgba(0,0,0,0.7);
padding: 10px 15px;
border-radius: 8px;
font-size: 14px;
}
.face-indicator {
display: inline-block;
width: 12px;
height: 12px;
border-radius: 50%;
margin-right: 8px;
animation: pulse 1s infinite;
}
.face-indicator.active { background: #00ff88; }
.face-indicator.inactive { background: #666; animation: none; }
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.5; }
}
.controls {
display: flex;
gap: 10px;
flex-wrap: wrap;
justify-content: center;
}
button {
padding: 12px 24px;
font-size: 14px;
border: none;
border-radius: 8px;
cursor: pointer;
transition: all 0.3s ease;
font-weight: 600;
}
.btn-primary { background: linear-gradient(135deg, #00d9ff, #0099cc); color: #fff; }
.btn-secondary { background: linear-gradient(135deg, #ff6b6b, #ee5a5a); color: #fff; }
.btn-success { background: linear-gradient(135deg, #00ff88, #00cc6a); color: #1a1a2e; }
button:hover { transform: translateY(-2px); box-shadow: 0 5px 20px rgba(0,0,0,0.3); }
.status {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 15px;
margin-top: 20px;
padding: 15px;
background: rgba(255,255,255,0.1);
border-radius: 8px;
}
.status-item { text-align: center; }
.status-label { font-size: 11px; opacity: 0.7; text-transform: uppercase; letter-spacing: 1px; }
.status-value { font-size: 20px; font-weight: bold; margin-top: 5px; }
.on { color: #00ff88; }
.off { color: #ff6b6b; }
.info { color: #00d9ff; }
</style>
</head>
<body>
<div class="container">
<h1>XIAO ESP32S3 Face Detection</h1>
<div class="video-container">
<img id="stream" src="" alt="Camera Stream">
<div class="overlay">
<span id="face-indicator" class="face-indicator inactive"></span>
<span id="face-text">No faces</span>
</div>
</div>
<div class="controls">
<button class="btn-primary" onclick="toggleDetection()">Toggle Detection</button>
<button class="btn-secondary" onclick="captureImage()">Capture Image</button>
<button class="btn-success" onclick="refreshStream()">Refresh Stream</button>
</div>
<div class="status">
<div class="status-item">
<div class="status-label">Detection</div>
<div id="detection-status" class="status-value on">ON</div>
</div>
<div class="status-item">
<div class="status-label">Faces</div>
<div id="face-count" class="status-value">0</div>
</div>
<div class="status-item">
<div class="status-label">Method</div>
<div id="method" class="status-value info">-</div>
</div>
<div class="status-item">
<div class="status-label">Free RAM</div>
<div id="heap" class="status-value info">-</div>
</div>
</div>
</div>
<script>
const streamUrl = window.location.protocol + '//' + window.location.hostname + ':81/stream';
document.getElementById('stream').src = streamUrl;
function toggleDetection() {
fetch('/toggle').then(r => r.json()).then(updateUI);
}
function captureImage() {
window.open('/capture', '_blank');
}
function refreshStream() {
document.getElementById('stream').src = streamUrl + '?' + Date.now();
}
function updateUI(data) {
document.getElementById('detection-status').textContent = data.detection ? 'ON' : 'OFF';
document.getElementById('detection-status').className = 'status-value ' + (data.detection ? 'on' : 'off');
document.getElementById('face-count').textContent = data.count;
document.getElementById('method').textContent = data.method || '-';
const indicator = document.getElementById('face-indicator');
const text = document.getElementById('face-text');
if (data.count > 0) {
indicator.className = 'face-indicator active';
text.textContent = data.count + ' face' + (data.count > 1 ? 's' : '');
} else {
indicator.className = 'face-indicator inactive';
text.textContent = 'No faces';
}
if (data.heap) {
document.getElementById('heap').textContent = Math.round(data.heap / 1024) + 'KB';
}
}
function pollStatus() {
fetch('/status')
.then(r => r.json())
.then(updateUI)
.catch(() => {});
}
setInterval(pollStatus, 500);
pollStatus();
</script>
</body>
</html>
)rawliteral";
httpd_resp_set_type(req, "text/html");
// html is NUL-terminated, so strlen() yields the page length without the terminator.
return httpd_resp_send(req, html, strlen(html));
}
void startCameraServer() {
httpd_config_t config = HTTPD_DEFAULT_CONFIG();
config.server_port = 80;
config.ctrl_port = 32768;
config.max_open_sockets = 7;
httpd_uri_t index_uri = { .uri = "/", .method = HTTP_GET, .handler = index_handler };
httpd_uri_t capture_uri = { .uri = "/capture", .method = HTTP_GET, .handler = capture_handler };
httpd_uri_t status_uri = { .uri = "/status", .method = HTTP_GET, .handler = status_handler };
httpd_uri_t toggle_uri = { .uri = "/toggle", .method = HTTP_GET, .handler = toggle_handler };
httpd_uri_t stream_uri = { .uri = "/stream", .method = HTTP_GET, .handler = stream_handler };
Serial.printf("Starting web server on port %d\n", config.server_port);
if (httpd_start(&camera_httpd, &config) == ESP_OK) {
httpd_register_uri_handler(camera_httpd, &index_uri);
httpd_register_uri_handler(camera_httpd, &capture_uri);
httpd_register_uri_handler(camera_httpd, &status_uri);
httpd_register_uri_handler(camera_httpd, &toggle_uri);
}
config.server_port = 81;
config.ctrl_port = 32769;
Serial.printf("Starting stream server on port %d\n", config.server_port);
if (httpd_start(&stream_httpd, &config) == ESP_OK) {
httpd_register_uri_handler(stream_httpd, &stream_uri);
}
}
void setup() {
Serial.begin(115200);
for (int i = 0; i < 10; i++){
Serial.println(i);
delay(500);
}
Serial.setDebugOutput(true);
Serial.println();
// Configure LED
pinMode(LED_GPIO_NUM, OUTPUT);
digitalWrite(LED_GPIO_NUM, HIGH);
// Check PSRAM
if (psramFound()) {
Serial.printf("PSRAM found: %d bytes\n", ESP.getPsramSize());
} else {
Serial.println("WARNING: No PSRAM found! Face detection may not work.");
Serial.println("Make sure PSRAM is set to 'OPI PSRAM' in Tools menu.");
}
// Camera configuration
camera_config_t config;
config.ledc_channel = LEDC_CHANNEL_0;
config.ledc_timer = LEDC_TIMER_0;
config.pin_d0 = Y2_GPIO_NUM;
config.pin_d1 = Y3_GPIO_NUM;
config.pin_d2 = Y4_GPIO_NUM;
config.pin_d3 = Y5_GPIO_NUM;
config.pin_d4 = Y6_GPIO_NUM;
config.pin_d5 = Y7_GPIO_NUM;
config.pin_d6 = Y8_GPIO_NUM;
config.pin_d7 = Y9_GPIO_NUM;
config.pin_xclk = XCLK_GPIO_NUM;
config.pin_pclk = PCLK_GPIO_NUM;
config.pin_vsync = VSYNC_GPIO_NUM;
config.pin_href = HREF_GPIO_NUM;
config.pin_sccb_sda = SIOD_GPIO_NUM;
config.pin_sccb_scl = SIOC_GPIO_NUM;
config.pin_pwdn = PWDN_GPIO_NUM;
config.pin_reset = RESET_GPIO_NUM;
config.xclk_freq_hz = 20000000;
config.grab_mode = CAMERA_GRAB_LATEST;
config.fb_location = CAMERA_FB_IN_PSRAM;
#if FACE_DETECTION_AVAILABLE == 1
// For neural network face detection, use RGB565
config.frame_size = FRAMESIZE_QVGA; // 320x240
config.pixel_format = PIXFORMAT_RGB565;
config.fb_count = 2;
config.jpeg_quality = 12;
Serial.println("Face detection: Neural Network (ESP-DL)");
#else
// For skin-tone detection, JPEG is fine and faster
config.frame_size = FRAMESIZE_VGA; // 640x480
config.pixel_format = PIXFORMAT_JPEG;
config.fb_count = 2;
config.jpeg_quality = 10;
Serial.println("Face detection: Skin-tone heuristic (fallback)");
#endif
// Initialize camera
esp_err_t err = esp_camera_init(&config);
if (err != ESP_OK) {
Serial.printf("Camera init failed with error 0x%x\n", err);
while (true) {
digitalWrite(LED_GPIO_NUM, LOW);
delay(100);
digitalWrite(LED_GPIO_NUM, HIGH);
delay(100);
}
}
Serial.println("Camera initialized successfully");
// Camera settings
sensor_t *s = esp_camera_sensor_get();
if (s) {
s->set_vflip(s, 0);
s->set_hmirror(s, 0);
s->set_brightness(s, 1);
s->set_contrast(s, 1);
}
// Connect to WiFi
WiFi.begin(ssid, password);
WiFi.setSleep(false);
Serial.print("Connecting to WiFi");
int attempts = 0;
while (WiFi.status() != WL_CONNECTED && attempts < 30) {
delay(500);
Serial.print(".");
digitalWrite(LED_GPIO_NUM, (attempts % 2) ? LOW : HIGH);
attempts++;
}
if (WiFi.status() == WL_CONNECTED) {
Serial.println("\nWiFi connected!");
digitalWrite(LED_GPIO_NUM, HIGH);
startCameraServer();
Serial.println("\n========================================");
Serial.println(" Face Detection Web Server Ready!");
Serial.println("========================================");
Serial.print(" Open: http://");
Serial.println(WiFi.localIP());
Serial.print(" Stream: http://");
Serial.print(WiFi.localIP());
Serial.println(":81/stream");
Serial.println("========================================\n");
} else {
Serial.println("\nWiFi connection failed!");
while (true) {
digitalWrite(LED_GPIO_NUM, LOW);
delay(1000);
digitalWrite(LED_GPIO_NUM, HIGH);
delay(1000);
}
}
}
// Arduino main loop: the HTTP servers do the real work, so this only waits
// 10 seconds and then prints a status line (free heap, free PSRAM and the
// current face count) to Serial.
void loop() {
  delay(10000);

  const unsigned long free_heap = (unsigned long)ESP.getFreeHeap();
  const unsigned long free_psram = (unsigned long)ESP.getFreePsram();
  Serial.printf("Status - Heap: %lu, PSRAM: %lu, Faces: %d\n",
                free_heap, free_psram, detectionCount);
}