#include <opencv2/imgproc/types_c.h>
#include <opencv2/imgproc.hpp>
#include "GIEFeatExtractor.h"
bool GIEFeatExtractor::cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
    if( !cpuPtr || !gpuPtr || size == 0 )
        return false;

    if( CUDA_FAILED(cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped)) )
        return false;

    if( CUDA_FAILED(cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0)) )
        return false;

    memset(*cpuPtr, 0, size);

    std::cout << "cudaAllocMapped : " << size << " bytes" << std::endl;

    return true;
}
bool GIEFeatExtractor::cudaFreeMapped(void *cpuPtr)
{
    if ( CUDA_FAILED( cudaFreeHost(cpuPtr) ) )
        return false;

    std::cout << "cudaFreeMapped: OK" << std::endl;

    return true;
}
bool GIEFeatExtractor::caffeToGIEModel( const std::string& deployFile,
                                        const std::string& modelFile,
                                        const std::string& binaryprotoFile,
                                        const std::vector<std::string>& outputs,
                                        unsigned int maxBatchSize,
                                        std::ostream& gieModelStream)
{
    // create the builder and an empty network definition
    nvinfer1::IBuilder* builder = createInferBuilder(gLogger);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();

    // kernel-timing iterations used by the auto-tuner
    builder->setMinFindIterations(3);
    builder->setAverageFindIterations(2);

    // parse the Caffe model to populate the network definition
    nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();

    const bool useFp16 = builder->platformHasFastFp16();
    std::cout << "Platform FP16 support: " << useFp16 << std::endl;
    std::cout << "Loading: " << deployFile << ", " << modelFile << std::endl;

    nvinfer1::DataType modelDataType = useFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
    const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile.c_str(),
                                                                              modelFile.c_str(),
                                                                              *network,
                                                                              modelDataType);
    if( !blobNameToTensor )
    {
        std::cout << "Failed to parse caffe network." << std::endl;
        return false;
    }

    if (binaryprotoFile != "")
    {
        // load the mean image from the binaryproto and keep a private copy of its data
        nvcaffeparser1::IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(binaryprotoFile.c_str());
        resizeDims = meanBlob->getDimensions();

        const float* meanDataConst = reinterpret_cast<const float*>(meanBlob->getData());

        meanData = (float *) malloc(resizeDims.w*resizeDims.h*resizeDims.c*resizeDims.n*sizeof(float));
        memcpy(meanData, meanDataConst, resizeDims.w*resizeDims.h*resizeDims.c*resizeDims.n*sizeof(float));

        meanBlob->destroy();
    }

    // mark the requested blobs as network outputs
    const size_t num_outputs = outputs.size();

    for( size_t n = 0; n < num_outputs; n++ )
        network->markOutput(*blobNameToTensor->find(outputs[n].c_str()));

    // build the engine
    std::cout << "Configuring CUDA engine..." << std::endl;

    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(16 << 20);

    if (useFp16)
        builder->setHalf2Mode(true);

    std::cout << "Building CUDA engine..." << std::endl;
    nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);

    if( !engine )
    {
        std::cout << "Failed to build CUDA engine." << std::endl;
        return false;
    }

    // serialize the engine to the output stream, then clean up
    network->destroy();
    parser->destroy();

    engine->serialize(gieModelStream);

    engine->destroy();
    builder->destroy();

    return true;
}
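
// Construct the extractor: stores the model configuration and runs init(),
// printing an error if initialization fails.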
GIEFeatExtractor::GIEFeatExtractor(string _caffemodel_file,
        string _binaryproto_meanfile, float _meanR, float _meanG, float _meanB,
        string _prototxt_file, int _resizeWidth, int _resizeHeight,
        string _blob_name)
{
    // default values, overwritten by init() on success
    caffemodel_file = "";
    binaryproto_meanfile = "";

    mEngine  = NULL;
    mContext = NULL;

    if( !init(_caffemodel_file, _binaryproto_meanfile, _meanR, _meanG, _meanB, _prototxt_file, _resizeWidth, _resizeHeight, _blob_name ) )
    {
        std::cout << "GIEFeatExtractor: init() failed." << std::endl;
    }
}
bool GIEFeatExtractor::init(string _caffemodel_file, string _binaryproto_meanfile, float _meanR, float _meanG, float _meanB, string _prototxt_file, int _resizeWidth, int _resizeHeight, string _blob_name)
{
    // check that the device supports mapped (zero-copy) host memory
    cudaDeviceProp prop;
    int whichDevice;

    if ( CUDA_FAILED( cudaGetDevice(&whichDevice)) )
        return false;

    if ( CUDA_FAILED( cudaGetDeviceProperties(&prop, whichDevice)) )
        return false;

    if (prop.canMapHostMemory != 1)
    {
        std::cout << "Device cannot map memory!" << std::endl;
        return false;
    }

    // store the configuration
    caffemodel_file = _caffemodel_file;
    binaryproto_meanfile = _binaryproto_meanfile;

    mean_values.push_back(_meanB);
    mean_values.push_back(_meanG);
    mean_values.push_back(_meanR);

    prototxt_file = _prototxt_file;

    blob_name = _blob_name;

    // convert the Caffe model to a GIE engine and serialize it to a stream
    std::stringstream gieModelStream;
    gieModelStream.seekg(0, gieModelStream.beg);

    if( !caffeToGIEModel( prototxt_file, caffemodel_file, binaryproto_meanfile, std::vector< std::string > { blob_name }, 1, gieModelStream) )
    {
        std::cout << "Failed to load: " << caffemodel_file << std::endl;
        return false;
    }

    std::cout << caffemodel_file << ": loaded." << std::endl;

    // create the inference runtime and deserialize the engine
    nvinfer1::IRuntime* infer = createInferRuntime(gLogger);

    if( !infer )
    {
        std::cout << "Failed to create InferRuntime." << std::endl;
        return false;
    }

    gieModelStream.seekg(0, gieModelStream.beg);    // rewind before deserializing

    nvinfer1::ICudaEngine* engine = infer->deserializeCudaEngine(gieModelStream);

    if( !engine )
    {
        std::cout << "Failed to create CUDA engine." << std::endl;
        return false;
    }

    nvinfer1::IExecutionContext* context = engine->createExecutionContext();

    if( !context )
    {
        std::cout << "Failed to create execution context." << std::endl;
        return false;
    }

    std::cout << "CUDA engine context initialized with " << engine->getNbBindings() << " bindings." << std::endl;

    // keep the handles used by extract_singleFeat_1D() and the destructor
    mEngine  = engine;
    mContext = context;

    // determine the input and output tensor bindings and their dimensions
    const int inputIndex  = engine->getBindingIndex("data");
    const int outputIndex = engine->getBindingIndex( blob_name.c_str() );

    std::cout << caffemodel_file << " input binding index: " << inputIndex << std::endl;
    std::cout << caffemodel_file << " output binding index: " << outputIndex << std::endl;

    nvinfer1::Dims3 inputDims  = engine->getBindingDimensions(inputIndex);
    nvinfer1::Dims3 outputDims = engine->getBindingDimensions(outputIndex);

    size_t inputSize  = inputDims.c * inputDims.h * inputDims.w * sizeof(float);
    size_t outputSize = outputDims.c * outputDims.h * outputDims.w * sizeof(float);

    std::cout << caffemodel_file << " input dims (c=" << inputDims.c << " h=" << inputDims.h << " w=" << inputDims.w << ") size=" << inputSize << std::endl;
    std::cout << caffemodel_file << " output dims (c=" << outputDims.c << " h=" << outputDims.h << " w=" << outputDims.w << ") size=" << outputSize << std::endl;

    // allocate zero-copy memory, shared between CPU and GPU, for the input blob
    if ( !cudaAllocMapped((void**)&mInputCPU, (void**)&mInputCUDA, inputSize) )
    {
        std::cout << "Failed to alloc CUDA mapped memory for input, " << inputSize << " bytes" << std::endl;
        return false;
    }

    mInputSize = inputSize;
    mWidth     = inputDims.w;
    mHeight    = inputDims.h;

    // allocate zero-copy memory for the output blob
    if( !cudaAllocMapped((void**)&mOutputCPU, (void**)&mOutputCUDA, outputSize) )
    {
        std::cout << "Failed to alloc CUDA mapped memory for output, " << outputSize << " bytes" << std::endl;
        return false;
    }

    mOutputSize = outputSize;
    mOutputDims = outputDims.c;

    std::cout << caffemodel_file << ": initialized." << std::endl;

    // with no mean image, the resize dimensions come from the constructor arguments
    if (binaryproto_meanfile == "")
    {
        resizeDims.h = _resizeHeight;
        resizeDims.w = _resizeWidth;
    }

    return true;
}
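
// Release the engine, the mapped buffers and, if a mean image was loaded, its copy.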
GIEFeatExtractor::~GIEFeatExtractor()
{
    if( mEngine != NULL )
        mEngine->destroy();

    cudaFreeMapped(mOutputCPU);
    cudaFreeMapped(mInputCPU);

    if (mean_values[0]==-1)
        free(meanData);
}
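
// Extract a single 1-D feature vector from an image: resize, center-crop,
// subtract the mean (image or per-channel values), run the network and append
// the output blob to 'features'. times[0] receives the preprocessing time and
// times[1] the total preprocessing + inference time, in milliseconds.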
bool GIEFeatExtractor::extract_singleFeat_1D(cv::Mat &imMat, vector<float> &features, float (&times)[2])
{
    // check the input image
    if (imMat.empty())
    {
        std::cout << "GIEFeatExtractor::extract_singleFeat_1D(): empty imMat!" << std::endl;
        return false;
    }

    // start timing: "prep" covers preprocessing only, "net" covers prep + inference
    cudaEvent_t startPrep, stopPrep, startNet, stopNet;
    cudaEventCreate(&startPrep);
    cudaEventCreate(&startNet);
    cudaEventCreate(&stopPrep);
    cudaEventCreate(&stopNet);

    cudaEventRecord(startPrep, NULL);
    cudaEventRecord(startNet, NULL);

    // resize to the mean-image / requested dimensions (cv::Size is width x height)
    if (imMat.rows != resizeDims.h || imMat.cols != resizeDims.w)
    {
        if (imMat.rows > resizeDims.h || imMat.cols > resizeDims.w)
            cv::resize(imMat, imMat, cv::Size(resizeDims.w, resizeDims.h), 0, 0, cv::INTER_LANCZOS4);
        else
            cv::resize(imMat, imMat, cv::Size(resizeDims.w, resizeDims.h), 0, 0, cv::INTER_LINEAR);
    }

    // central crop of the network input size
    int h_off = (imMat.rows - mHeight) / 2;
    int w_off = (imMat.cols - mWidth) / 2;

    cv::Mat cv_cropped_img = imMat;
    cv::Rect roi(w_off, h_off, mWidth, mHeight);
    cv_cropped_img = imMat(roi);

    // fill the CHW float input blob, subtracting the mean image or per-channel means
    int top_index;
    for (int h = 0; h < mHeight; ++h)
    {
        const uchar* ptr = cv_cropped_img.ptr<uchar>(h);
        int img_index = 0;
        for (int w = 0; w < mWidth; ++w)
        {
            for (int c = 0; c < imMat.channels(); ++c)
            {
                top_index = (c * mHeight + h) * mWidth + w;
                float pixel = static_cast<float>(ptr[img_index++]);
                if (mean_values[0]==-1)
                {
                    int mean_index = (c * imMat.rows + h_off + h) * imMat.cols + w_off + w;
                    mInputCPU[top_index] = pixel - meanData[mean_index];
                }
                else
                {
                    mInputCPU[top_index] = pixel - mean_values[c];
                }
            }
        }
    }

    // run inference: the mapped buffers are shared with the GPU, no copies needed
    void* inferenceBuffers[] = { mInputCUDA, mOutputCUDA };

    cudaEventRecord(stopPrep, NULL);
    cudaEventSynchronize(stopPrep);
    cudaEventElapsedTime(times, startPrep, stopPrep);

    mContext->execute(1, inferenceBuffers);

    features.insert(features.end(), &mOutputCPU[0], &mOutputCPU[mOutputDims]);

    cudaEventRecord(stopNet, NULL);
    cudaEventSynchronize(stopNet);
    cudaEventElapsedTime(times+1, startNet, stopNet);

    return true;
}
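
// Usage sketch (illustrative only; the file and blob names below are hypothetical):
//
//     GIEFeatExtractor extractor("bvlc_googlenet.caffemodel",
//                                "imagenet_mean.binaryproto",
//                                -1, -1, -1,           // -1 means: subtract the mean image
//                                "deploy.prototxt",
//                                256, 256,             // resize ignored when a mean image is given
//                                "pool5/7x7_s1");      // output blob to extract
//
//     cv::Mat img = cv::imread("image.jpg");
//     std::vector<float> feat;
//     float times[2];
//     if (extractor.extract_singleFeat_1D(img, feat, times))
//         std::cout << feat.size() << "-D feature in " << times[1] << " ms" << std::endl;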