call mexcuda file returned gpuarray cost abnormal time
显示 更早的评论
I found when I call the returned gpuarray from mex(cuda c) file coat so much time.
Here is my function,I use the returned array to do a simplest operation,but it cost much time.
function y = Afun_gmm_test(x,bmag, bori,beta,lambda,transp_flag)
ss = size(bmag);
xx = reshape(x,ss);
xx = gpuArray(xx);
bmag = gpuArray(bmag);
bori = gpuArray(bori);
Kx = MotionBlurConv_gpu(xx,bmag,bori);
Kxx = MotionBlurConv_mirror_gpu(Kx,bmag,bori);
Kxx = Kxx + 1;
y = lambda * Kxx + beta*xx;
y = y(:);
y = gather(y);
time cost

MotionBlurConv_gpu and MotionBlurConv_mirror_gpu are two mex file function,which returned gpuarray.
Here is my MotionBlurConv_mirror_gpu code: it's a bit long,you can only read the 'host code' part if you are inpatient,I think it's the only part may be wrong
#include "mex.h"
#include "gpu/mxGPUArray.h"
#include <stdlib.h>
#include <math.h>
#define eps 2.2204e-16f
#define sign(n) (n<0?-1.0f:1.0f)
#define ABS(x) (x<0?-x:x)
#define M_PI 3.14159265358979323846
/*
* Device code
*/
void __global__ MotionBlurConv(int w_img,int h_img,double *X,
double *bmag,double *bori,double *R)
{
/* Perform convolution */
int sx, sy;
int offSet_img, off_x, off_y, l, k;
double *src_img,*dst;
double half, phi, cosphi, xsign, sinphi, x2lastpix, mag, ori,dist2line, dist2cent, linewdt = 1, sumKernel = 0;
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if (i < w_img && j < h_img) {
/* Set indicex */
src_img = X + j * w_img + i;
dst = R + j * w_img + i;
/* Generate kernel value */
mag = bmag[j * w_img + i];
ori = bori[j * w_img + i];
half = (mag - 1) / 2.0f;
phi = fmod(ori, 180.0) / 180 * M_PI;
cosphi = cos(phi);
sinphi = sin(phi);
xsign = sign(cosphi);
double tmp = half*cosphi + linewdt*xsign - mag*eps;
sx = (floor(ABS(tmp)));
tmp = half*sinphi + linewdt - mag*eps;
sy = (floor(ABS(tmp)));
// if (j == y0 & i == x0)
// {
// printf("%f %f %f %d %d \n", mag, ori, half, sx, sy);
// printf(" %d %d \n", sx, sy);
// printf("%f %f %f %f %f %f\n", half, cosphi, linewdt, xsign, mag, eps);
// printf("%f %f \n",half*cosphi + linewdt*xsign - mag*eps, half*sinphi + linewdt - mag*eps);
// }
/* Convolution at each location */
sumKernel = 0;
for(l = -sy; l <= sy; l++) // y
{
for(k = -sx; k <= sx; k++) // x
{
// compute the kernel value at currret location
dist2line = l * cosphi + k * sinphi;
dist2cent = sqrtf(l * l + k * k);
if (ABS(dist2line) <= linewdt & dist2cent >= half) // if it is the end point
{
x2lastpix = half - ABS((k + dist2line*sinphi)/cosphi);
dist2line = sqrt(pow(dist2line, 2) + pow(x2lastpix, 2));
}
dist2line = linewdt + eps - ABS(dist2line);
if (dist2line<0) dist2line = 0;
off_x = (i + k < w_img & i + k >= 0) ? k : (i + k < 0 ? -i : w_img - 1 - i);
off_y = (j + l < h_img & j + l >= 0) ? l : (j + l < 0 ? -j : h_img - 1 - j);
offSet_img = (off_y) * w_img + off_x;
//ker[(l + sy) * (2*sx + 1) + k + sx] = dist2line * (*(src_img));
//*(dst + offSet_img) += dist2line * (*(src_img)); /**/
sumKernel += dist2line;
//if (j == y0 & i == x0)
//{
// printf(" %f ", *(src_img));
//}
}
}
//printf(" %d %d \n", sx, sy);
// normalization
for(l = -sy; l <= sy; l++) // y
{
for(k = -sx; k <= sx; k++) // x
{
dist2line = l * cosphi + k * sinphi;
dist2cent = sqrtf(l * l + k * k);
if (ABS(dist2line) <= linewdt & dist2cent >= half) // if it is the end point
{
x2lastpix = half - ABS((k + dist2line*sinphi)/cosphi);
dist2line = sqrt(pow(dist2line, 2) + pow(x2lastpix, 2));
}
dist2line = linewdt + eps - ABS(dist2line);
if (dist2line<0) dist2line = 0;
off_x = (i + k < w_img & i + k >= 0) ? k : (i + k < 0 ? -i : w_img - 1 - i);
off_y = (j + l < h_img & j + l >= 0) ? l : (j + l < 0 ? -j : h_img - 1 - j);
offSet_img = (off_y) * w_img + off_x;
//if((j * w_img + i + offSet_img == y0 * w_img + x0))
// {
// printf(" (%d %d %d %d %f %f %f %d)", off_x, off_y, i, j, ker[(l + sy) * (2*sx + 1) + k + sx], sumKernel, R[j * w_img + i + offSet_img], y0 * w_img + x0);
// }
if (sumKernel > 0)
{
//*(dst + offSet_img) += ker[(l + sy) * (2*sx + 1) + k + sx] / sumKernel;
atomicAdd(R + j * w_img + i + offSet_img,dist2line * (*(src_img))/ sumKernel);
//__syncthreads();
}
else
{
atomicAdd(R + j * w_img + i + offSet_img,dist2line * (*(src_img)));
//__syncthreads();
}
}
}
}
}
/*
* Host code
*/
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
/* Declare all variables.*/
mxGPUArray const *matrix_X, *matrix_bmag, *matrix_bori;
mxGPUArray *B;
double *X, *bmag, *bori, *R;
int w_img, h_img;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
/* Initialize the MathWorks GPU API. */
mxInitGPU();
/* Throw an error if the input is not a GPU array. */
if ((nrhs!=3) || !(mxIsGPUArray(prhs[0]))) {
mexErrMsgIdAndTxt(errId, errMsg);
}
matrix_X = mxGPUCreateFromMxArray(prhs[0]);
matrix_bmag = mxGPUCreateFromMxArray(prhs[1]);
matrix_bori = mxGPUCreateFromMxArray(prhs[2]);
/*
* Verify that A really is a double array before extracting the pointer.
*/
if (mxGPUGetClassID(matrix_X) != mxDOUBLE_CLASS) {
mexErrMsgIdAndTxt(errId, errMsg);
}
/*
* Now that we have verified the data type, extract a pointer to the input
* data on the device.
*/
X = (double *)(mxGPUGetDataReadOnly(matrix_X));
bmag = (double *)(mxGPUGetDataReadOnly(matrix_bmag));
bori = (double *)(mxGPUGetDataReadOnly(matrix_bori));
mwSize const *size= mxGPUGetDimensions(matrix_X);
w_img = (int)(size[0]);
h_img = (int)(size[1]);
/* Choose a reasonably sized number of threads for the block. */ /*now only process the N*N case*/
dim3 threadsPerBlock(32, 32); /*when 256 threads 16,when 1024 threads 32,if process all at once w_img,h_img*/
dim3 blocksPerGrid((w_img + threadsPerBlock.x -1) / threadsPerBlock.x, (h_img + threadsPerBlock.y -1) / threadsPerBlock.y);
/* Create a GPUArray to hold the result and get its underlying pointer. */
B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(matrix_X),
mxGPUGetDimensions(matrix_X),
mxGPUGetClassID(matrix_X),
mxGPUGetComplexity(matrix_X),
MX_GPU_INITIALIZE_VALUES);
R = (double *)(mxGPUGetData(B));
/*
* Call the kernel using the CUDA runtime API. We are using a 1-d grid here,
* and it would be possible for the number of elements to be too large for
* the grid. For this example we are not guarding against this possibility.
*/
MotionBlurConv<<<blocksPerGrid, threadsPerBlock>>>(w_img, h_img, X, bmag, bori ,R );
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(B);
/*
* The mxGPUArray pointers are host-side structures that refer to device
* data. These must be destroyed before leaving the MEX function.
*/
mxGPUDestroyGPUArray(matrix_X);
mxGPUDestroyGPUArray(matrix_bmag);
mxGPUDestroyGPUArray(matrix_bori);
mxGPUDestroyGPUArray(B);
}
So does anyone have the same the problem? I want to know the reason of it and how to fix it. Thanks.
采纳的回答
更多回答(0 个)
类别
在 帮助中心 和 File Exchange 中查找有关 GPU CUDA and MEX Programming 的更多信息
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!