Visual Servoing Platform  version 3.3.0
Tutorial: Deep learning object detection

Introduction

This tutorial shows how to use vpDetectorDNN (DNN stands for Deep Neural Network) class to perform object detection with deep learning. This class is a small wrapper over the OpenCV DNN module.

It provides convenient ways to retrieve detection bounding boxes, class ids and confidence values. For other tasks such as image classification or more elaborate functionality, you should use directly the OpenCV DNN API.

In the next section you will find an example that shows how to perform face detection in a single image or in images acquired from a camera connected to your computer.

Note that all the material (source code and network model) described in this tutorial is part of the ViSP source code and can be downloaded using the following command:

$ svn export https://github.com/lagadic/visp.git/trunk/tutorial/detection/dnn

Face detection

The following example, also available in tutorial-dnn-object-detection-live.cpp, detects human faces.

#include <visp3/core/vpConfig.h>
#include <visp3/detection/vpDetectorDNN.h>
#include <visp3/gui/vpDisplayGDI.h>
#include <visp3/gui/vpDisplayOpenCV.h>
#include <visp3/gui/vpDisplayX.h>
int main(int argc, const char *argv[])
{
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403)
  try {
    int opt_device = 0;       // camera device index, used only when input is empty
    std::string input = "";   // path to an image or video file; empty means "grab from camera"
    // Default DNN model (trained weights) and config (network topology) perform face detection
    std::string model = "opencv_face_detector_uint8.pb";
    std::string config = "opencv_face_detector.pbtxt";
    int inputWidth = 300, inputHeight = 300;            // size of the blob fed to the network
    double meanR = 104.0, meanG = 177.0, meanB = 123.0; // mean-subtraction values
    double scaleFactor = 1.0;                           // data range normalization factor
    bool swapRB = false;      // set to true when the model was trained on RGB data (OpenCV is BGR)
    float confThresh = 0.5f;  // confidence threshold to filter detections
    float nmsThresh = 0.4f;   // Non-Maximum Suppression threshold
    std::string labelFile = ""; // optional file with one class label per line

    // Command-line parsing
    for (int i = 1; i < argc; i++) {
      if (std::string(argv[i]) == "--device" && i + 1 < argc) {
        opt_device = atoi(argv[i + 1]);
      } else if (std::string(argv[i]) == "--input" && i + 1 < argc) {
        input = std::string(argv[i + 1]);
      } else if (std::string(argv[i]) == "--model" && i + 1 < argc) {
        model = std::string(argv[i + 1]);
      } else if (std::string(argv[i]) == "--config" && i + 1 < argc) {
        config = std::string(argv[i + 1]);
      } else if (std::string(argv[i]) == "--width" && i + 1 < argc) {
        inputWidth = atoi(argv[i + 1]);
      } else if (std::string(argv[i]) == "--height" && i + 1 < argc) {
        inputHeight = atoi(argv[i + 1]);
      } else if (std::string(argv[i]) == "--mean" && i + 3 < argc) {
        meanR = atof(argv[i + 1]);
        meanG = atof(argv[i + 2]);
        meanB = atof(argv[i + 3]);
      } else if (std::string(argv[i]) == "--scale" && i + 1 < argc) {
        scaleFactor = atof(argv[i + 1]);
      } else if (std::string(argv[i]) == "--swapRB") {
        swapRB = true;
      } else if (std::string(argv[i]) == "--confThresh" && i + 1 < argc) {
        confThresh = (float)atof(argv[i + 1]);
      } else if (std::string(argv[i]) == "--nmsThresh" && i + 1 < argc) {
        nmsThresh = (float)atof(argv[i + 1]);
      } else if (std::string(argv[i]) == "--labels" && i + 1 < argc) {
        labelFile = std::string(argv[i + 1]);
      } else if (std::string(argv[i]) == "--help" || std::string(argv[i]) == "-h") {
        // FIX: "-- mean" -> "--mean" so the usage text matches the accepted option name
        std::cout << argv[0] << " --device <camera device number> --input <path to image or video>"
          " (camera is used if input is empty) --model <path to net trained weights>"
          " --config <path to net config file>"
          " --width <blob width> --height <blob height>"
          " --mean <meanR meanG meanB> --scale <scale factor>"
          " --swapRB --confThresh <confidence threshold>"
          " --nmsThresh <NMS threshold> --labels <path to label file>" << std::endl;
        return EXIT_SUCCESS;
      }
    }

    std::cout << "Model: " << model << std::endl;
    std::cout << "Config: " << config << std::endl;
    std::cout << "Width: " << inputWidth << std::endl;
    std::cout << "Height: " << inputHeight << std::endl;
    std::cout << "Mean: " << meanR << ", " << meanG << ", " << meanB << std::endl;
    std::cout << "Scale: " << scaleFactor << std::endl;
    std::cout << "Swap RB? " << swapRB << std::endl;
    std::cout << "Confidence threshold: " << confThresh << std::endl;
    std::cout << "NMS threshold: " << nmsThresh << std::endl;

    // Grab either from a camera device or from an image/video file
    cv::VideoCapture capture;
    if (input.empty()) {
      capture.open(opt_device);
    } else {
      capture.open(input);
    }

    // FIX: the image and the display were missing from the listing; without them
    // I, d and dnn used below are undeclared and the example does not compile.
    vpImage<vpRGBa> I;
#if defined(VISP_HAVE_X11)
    vpDisplayX d;
#elif defined(VISP_HAVE_GDI)
    vpDisplayGDI d;
#elif defined(VISP_HAVE_OPENCV)
    vpDisplayOpenCV d;
#endif

    // Create and configure the DNN object detector
    vpDetectorDNN dnn;
    dnn.readNet(model, config);
    dnn.setInputSize(inputWidth, inputHeight);
    dnn.setMean(meanR, meanG, meanB);
    dnn.setScaleFactor(scaleFactor);
    dnn.setSwapRB(swapRB);
    dnn.setConfidenceThreshold(confThresh);
    dnn.setNMSThreshold(nmsThresh);

    // Optional class labels, one per line; otherwise raw class ids are displayed
    std::vector<std::string> labels;
    if (!labelFile.empty()) {
      std::ifstream f_label(labelFile);
      std::string line;
      while (std::getline(f_label, line)) {
        labels.push_back(line);
      }
    }

    cv::Mat frame;
    while (true) {
      capture >> frame;
      if (frame.empty())
        break;

      if (I.getSize() == 0) {
        // First frame: convert to ViSP format, then open and title the display window
        vpImageConvert::convert(frame, I);
        d.init(I);
        vpDisplay::setTitle(I, "DNN object detection");
      } else {
        // FIX: the conversion was missing; subsequent frames were never copied into I
        vpImageConvert::convert(frame, I);
      }

      double t = vpTime::measureTimeMs();
      std::vector<vpRect> boundingBoxes;
      dnn.detect(I, boundingBoxes);
      t = vpTime::measureTimeMs() - t; // FIX: report elapsed time, not an absolute timestamp

      std::vector<int> classIds = dnn.getDetectionClassIds();
      std::vector<float> confidences = dnn.getDetectionConfidence();

      vpDisplay::display(I); // FIX: the display/flush pair was missing from the loop
      for (size_t i = 0; i < boundingBoxes.size(); i++) {
        vpDisplay::displayRectangle(I, boundingBoxes[i], vpColor::red, false, 2);
        std::ostringstream oss;
        if (labels.empty())
          oss << "class: " << classIds[i];
        else
          oss << labels[classIds[i]];
        oss << " - conf: " << confidences[i];
        vpDisplay::displayText(I, (int)boundingBoxes[i].getTop() - 10, (int)boundingBoxes[i].getLeft() + 10,
                               oss.str(), vpColor::red);
      }
      std::ostringstream oss;
      oss << "Detection time: " << t << " ms";
      vpDisplay::displayText(I, 20, 20, oss.str(), vpColor::red);
      vpDisplay::flush(I);

      // A mouse click in the window stops the processing loop
      if (vpDisplay::getClick(I, false))
        break;
    }
  } catch (const vpException &e) {
    std::cout << e.what() << std::endl;
  }
#else
  (void)argc;
  (void)argv;
#endif
  return EXIT_SUCCESS;
}

The default behavior is to detect human face, but you can input another model to detect the objects you want. To see which are the options, run:

$ ./tutorial-dnn-object-detection-live --help

Default DNN model and config files perform human face detection.

std::string model = "opencv_face_detector_uint8.pb";
std::string config = "opencv_face_detector.pbtxt";

It is provided by OpenCV and has been trained with the following characteristics:

This is a brief description of training process which has been used to get res10_300x300_ssd_iter_140000.caffemodel. The model was created with SSD framework using ResNet-10 like architecture as a backbone. Channels count in ResNet-10 convolution layers was significantly dropped (2x- or 4x- fewer channels). The model was trained in Caffe framework on some huge and available online dataset.

More specifically, the model used (opencv_face_detector_uint8.pb) has been quantized (with the TensorFlow library) to 8-bit unsigned integers to reduce the size of the trained model (2.7 MB vs 10.7 MB for res10_300x300_ssd_iter_140000.caffemodel).

To create the DNN object detector:

dnn.readNet(model, config);
dnn.setInputSize(inputWidth, inputHeight);
dnn.setMean(meanR, meanG, meanB);
dnn.setScaleFactor(scaleFactor);
dnn.setSwapRB(swapRB);
dnn.setConfidenceThreshold(confThresh);
dnn.setNMSThreshold(nmsThresh);

model is the network trained weights, config is the network topology description.

inputWidth and inputHeight are the dimensions to resize the input image into the blob that is fed in entry of the network.

meanR, meanG and meanB are the values used for mean subtraction.

scaleFactor is used to normalize the data range.

swapRB should be set to true when the model has been trained on RGB data. Since OpenCV uses the BGR convention, the R and B channels should then be swapped.

You can directly refer to the OpenCV model zoo for the parameters values.

confThresh is the confidence threshold used to filter the detections after inference.

nmsThresh is the Non-Maximum Suppression threshold. It is used to filter out multiple detections that occur at approximately the same location.

After setting the correct parameters, you can easily detect objects in an image with

std::vector<vpRect> boundingBoxes;
dnn.detect(I, boundingBoxes);

Class ids and detection confidence scores can be retrieved with

std::vector<int> classIds = dnn.getDetectionClassIds();
std::vector<float> confidences = dnn.getDetectionConfidence();

Object detection model zoo

You can find more models in the OpenCV model zoo.

vpDisplayX
Use the X11 console to display images on unix-like OS. Thus to enable this class X11 should be instal...
Definition: vpDisplayX.h:149
vpDetectorDNN::setInputSize
void setInputSize(int width, int height)
Definition: vpDetectorDNN.cpp:324
vpDetectorDNN
Definition: vpDetectorDNN.h:51
vpDisplay::setTitle
static void setTitle(const vpImage< unsigned char > &I, const std::string &windowtitle)
Definition: vpDisplay_uchar.cpp:1222
vpImageConvert::convert
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Definition: vpImageConvert.cpp:78
vpDetectorDNN::setMean
void setMean(double meanR, double meanG, double meanB)
Definition: vpDetectorDNN.cpp:336
vpDetectorDNN::setSwapRB
void setSwapRB(bool swapRB)
Definition: vpDetectorDNN.cpp:382
vpDetectorDNN::getDetectionClassIds
std::vector< int > getDetectionClassIds(bool afterNMS=true) const
Definition: vpDetectorDNN.cpp:141
vpImage::getSize
unsigned int getSize() const
Definition: vpImage.h:224
vpDisplayGDI
Display for windows using GDI (available on any windows 32 platform).
Definition: vpDisplayGDI.h:127
vpDisplay::displayRectangle
static void displayRectangle(const vpImage< unsigned char > &I, const vpImagePoint &topLeft, unsigned int width, unsigned int height, const vpColor &color, bool fill=false, unsigned int thickness=1)
Definition: vpDisplay_uchar.cpp:547
vpDetectorDNN::setScaleFactor
void setScaleFactor(double scaleFactor)
Definition: vpDetectorDNN.cpp:373
vpDetectorDNN::setNMSThreshold
void setNMSThreshold(float nmsThreshold)
Definition: vpDetectorDNN.cpp:346
vpDetectorDNN::detect
virtual bool detect(const vpImage< unsigned char > &I)
Definition: vpDetectorDNN.cpp:54
vpDisplayOpenCV
The vpDisplayOpenCV allows to display image using the OpenCV library. Thus to enable this class OpenC...
Definition: vpDisplayOpenCV.h:140
vpTime::measureTimeMs
VISP_EXPORT double measureTimeMs()
Definition: vpTime.cpp:125
vpDisplay::display
static void display(const vpImage< unsigned char > &I)
Definition: vpDisplay_uchar.cpp:739
vpDisplay::displayText
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
Definition: vpDisplay_uchar.cpp:663
vpException::what
const char * what() const
Definition: vpException.cpp:101
vpDetectorDNN::getDetectionConfidence
std::vector< float > getDetectionConfidence(bool afterNMS=true) const
Definition: vpDetectorDNN.cpp:157
vpDisplayX::init
void init(vpImage< unsigned char > &I, int winx=-1, int winy=-1, const std::string &title="")
Definition: vpDisplayX.cpp:251
vpDetectorDNN::setConfidenceThreshold
void setConfidenceThreshold(float confThreshold)
Definition: vpDetectorDNN.cpp:314
vpDisplay::flush
static void flush(const vpImage< unsigned char > &I)
Definition: vpDisplay_uchar.cpp:715
vpImage< vpRGBa >
vpDisplay::getClick
static bool getClick(const vpImage< unsigned char > &I, bool blocking=true)
Definition: vpDisplay_uchar.cpp:764
vpException
error that can be emited by ViSP classes.
Definition: vpException.h:70
vpDetectorDNN::readNet
void readNet(const std::string &model, const std::string &config="", const std::string &framework="")
Definition: vpDetectorDNN.cpp:300
vpColor::red
static const vpColor red
Definition: vpColor.h:178