Can I get faster MatrixMultiplication using CUDA than with Matlab internal GPU implementation??

Question

Robert 2013-2-1

0
链接

此问题的直接链接

https://ww2.mathworks.cn/matlabcentral/answers/60812-can-i-get-faster-matrixmultiplication-using-cuda-than-with-matlab-internal-gpu-implementation

回答： Zainub 2015-2-9

I have to Multiply a Matrix A (size about 400 x400 ) with M Matrices B, while m is also about 500.

N=400;M=500;
A=rand(N,N);
B=rand(N,N,M);
C=zeros(N,N,M);

Using Matlab GPU implementation:

gpu_A=gpuArray(A);
gpu_B=gpuArray(B);
zCell = arrayfun(@(iz) gpu_A * gpu_B(:,:,iz),1:size(gpu_B,3),'uniformOutput',false);
size(zCell)
C3 = gather(cat(3, zCell{:}));

My hope was that an implementation using CUDA code and a mex file would be faster than this, similar to the Mandelbrot example in the MatLab help.

My Cuda Code uses the CUBLAS function: cublasDgemmBatched

To save memory the matrix is stored only one time on the GPU, but I use 500 pointers showing to the same matrix A..

My complete CUDA code you find below. Any suggestion how to get it faster? Now MatLab is about 10% faster than my CUDA code..

#include "mex.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
void inline checkError(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("%s", msg);
        exit(EXIT_FAILURE);
    }
}
// end of CUDA Helper Functions
void initializeCUDA(int &devID)
{
    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;
    // get number of SMs on this GPU
    error = cudaGetDevice(&devID);
      if (error != cudaSuccess)
      {
          printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      cudaDeviceProp deviceProp;
      error = cudaGetDeviceProperties(&deviceProp, devID);
      if (error != cudaSuccess)
      {
          printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }  
}
////////////////////////////////////////////////////////////////////////////////
//Make Matrix Multiplication using CUBLAS
////////////////////////////////////////////////////////////////////////////////
void matrixMultiply3D( int devID,double *h_A, double *h_B, double *h_C ,mwSize ncolsA , mwSize nrowsA,mwSize ncolsB,mwSize nrowsB,mwSize nB)
{
  initializeCUDA( devID);
      cudaDeviceProp deviceProp;
      cudaError_t error;
      error = cudaGetDeviceProperties(&deviceProp, devID);
      if (error != cudaSuccess)
      {
          printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      // use a larger block size for Fermi and above
      int block_size = (deviceProp.major < 2) ? 16 : 32;
      unsigned int size_A = ncolsA * nrowsA;
      unsigned int mem_size_A = sizeof(double) * size_A;
      unsigned int size_B = ncolsB * nrowsB * nB;
      unsigned int mem_size_B = sizeof(double) * size_B;
      const unsigned int nrowsC = nrowsA;
      const unsigned int ncolsC = ncolsB;
      // allocate device memory
      double *d_A, *d_B, *d_C;
      const double **dd_A, **dd_B;
      double **dd_C;
      double **point_temp =new double*[nB];
      unsigned int size_C = nrowsA * ncolsB * nB;
      unsigned int mem_size_C = sizeof(double) * size_C;
      unsigned int mem_size_point = sizeof(double*) * nB;
       error = cudaMalloc((void **) &d_A, mem_size_A);
      if (error != cudaSuccess)
      {
          printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      error = cudaMalloc((void **) &dd_A, mem_size_point);
      if (error != cudaSuccess)
      {
          printf("cudaMalloc dd_A returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      for ( int i=0; i<nB; i++) point_temp[i] = d_A;
      // copy host memory to device
      error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
      if (error != cudaSuccess)
      {
          printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      // copy host memory to device
      error = cudaMemcpy(dd_A, point_temp, mem_size_point, cudaMemcpyHostToDevice);
      if (error != cudaSuccess)
      {
          printf("cudaMemcpy dd_A d_A returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      error = cudaMalloc((void **) &d_B, mem_size_B);
      if (error != cudaSuccess)
      {
          printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      error = cudaMalloc((void **) &dd_B, nB*sizeof(double*));
      if (error != cudaSuccess)
      {
          printf("cudaMalloc dd_B returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      for ( int i=0; i<nB; i++) point_temp[i] = d_B + i*ncolsB * nrowsB;
      error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
      if (error != cudaSuccess)
      {
          printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      // copy host memory to device
      error = cudaMemcpy(dd_B, point_temp, mem_size_point, cudaMemcpyHostToDevice);
      if (error != cudaSuccess)
      {
          printf("cudaMemcpy dd_B d_B returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      } 
      error = cudaMalloc((void **) &d_C, mem_size_C);
      if (error != cudaSuccess)
      {
          printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      error = cudaMalloc((void **) &dd_C, nB*sizeof(double*));
      if (error != cudaSuccess)
      {
          printf("cudaMalloc dd_C returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      for ( int i=0; i<nB; i++) point_temp[i] = d_C + i*ncolsC * nrowsC;
      // copy host memory to device
      error = cudaMemcpy(dd_C, point_temp, mem_size_point, cudaMemcpyHostToDevice);
      if (error != cudaSuccess)
      {
          printf("cudaMemcpy dd_C d_C returned error code %d, line(%d)\n", error, __LINE__);
          exit(EXIT_FAILURE);
      }
      // setup execution parameters
      dim3 threads(block_size, block_size);
      dim3 grid(ncolsB / threads.x, nrowsA / threads.y);
      // execute the kernel
      // CUBLAS version 2.0
      {
          cublasHandle_t handle;
          cublasStatus_t ret;
          ret = cublasCreate(&handle);
          if (ret != CUBLAS_STATUS_SUCCESS)
          {
              printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
              exit(EXIT_FAILURE);
          }
          const double alpha = 1.0f;
          const double beta  = 0.0f;
          ret = cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, nrowsA, ncolsB, ncolsA, &alpha, dd_A, nrowsA, dd_B, nrowsB, &beta, dd_C, nrowsC, nB);
           if (ret != CUBLAS_STATUS_SUCCESS)
          {
              printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
              exit(EXIT_FAILURE);
          }
          // copy result from device to host
          error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
          if (error != cudaSuccess)
          {
              printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
              exit(EXIT_FAILURE);
          }
          checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
      }
      cudaFree(d_A);
      cudaFree(d_B);
      cudaFree(d_C);
      cudaFree(dd_A);
      cudaFree(dd_B);
      cudaFree(dd_C);
      delete [] point_temp;
      cudaDeviceReset();
}
/* The gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[]){
  /*nlhs number of outputs
    plhs pointer to outputs
    nrhs number of inputs
    prhs   pointer to inputs  */
    double *inMatrixA;              /* 1xN input matrix */
    double *inMatrixB;               /* 1xN input matrix */
    size_t ncolsA;                   /* size of matrix */
    size_t nrowsA;                   /* size of matrix */
    size_t ncolsB;                   /* size of matrix */
    size_t nrowsB;                   /* size of matrix */
    size_t nB;                   /* number of matrices B */
    const mwSize *dims;
      double *outMatrix;              /* output matrix */
      /* check for proper number of arguments */
      if(nrhs!=2) {
          mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nrhs","Two Matrix inputs required.");
      }
      if(nlhs!=1) {
          mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nlhs","One output required.");
      }
      /* get the value of the scalar input  */
      inMatrixA = (double*) mxGetData(prhs[0]);
      /* create a pointer to the real data in the input matrix  */
      inMatrixB = (double*) mxGetData(prhs[1]);
      /* get dimensions of the input matrix */
      ncolsA = mxGetN(prhs[0]);
      nrowsA = mxGetM(prhs[0]);   
      dims = mxGetDimensions(prhs[1]);
      ncolsB = dims[1];
      nrowsB = dims[0];
      nB = dims[2];
      /* create the output matrix */
      plhs[0] = mxCreateNumericArray(3 , dims, mxDOUBLE_CLASS, mxREAL);
      /* get a pointer to the real data in the output matrix */
      outMatrix = (double*) mxGetData(plhs[0]);
      /* call the computational routine */
      matrixMultiply3D(0, inMatrixA,inMatrixB,outMatrix,(mwSize)ncolsA , (mwSize)nrowsA,(mwSize)ncolsB,(mwSize)nrowsB,(mwSize) nB);//, sMatrixSize &matrix_size)
}

Thanks in advance

Robert

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

请先登录，再进行评论。

请先登录，再回答此问题。

Answer 1

James Tursa 2013-2-1

1
链接

此回答的直接链接

https://ww2.mathworks.cn/matlabcentral/answers/60812-can-i-get-faster-matrixmultiplication-using-cuda-than-with-matlab-internal-gpu-implementation#answer_73151

在 MATLAB Online 中打开

I'm curious how the times compare to this:

C = mtimesx(A,B);

MTIMESX can be found here:

href = ""<http://www.mathworks.com/matlabcentral/fileexchange/25977-mtimesx-fast-matrix-multiply-with-multi-dimensional-support</a>>

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

请先登录，再进行评论。

Answer 2

Francisco Ramírez 2013-2-18

0
链接

此回答的直接链接

https://ww2.mathworks.cn/matlabcentral/answers/60812-can-i-get-faster-matrixmultiplication-using-cuda-than-with-matlab-internal-gpu-implementation#answer_75535

In which format do you saved the file (.cu, .c, .cpp)?

How was the compilation instruction (mex ...)?

Regards,

Francisco

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

请先登录，再进行评论。

Answer 3

Francisco Ramírez 2013-2-22

0
链接

此回答的直接链接

https://ww2.mathworks.cn/matlabcentral/answers/60812-can-i-get-faster-matrixmultiplication-using-cuda-than-with-matlab-internal-gpu-implementation#answer_76129

Hi Robert,

I was looking into the CUBLAS documentation for more information about the function "cublasDgemmBatched" and I found the following:

cublasDgemmBatched. This function is intended to be used for matrices of small sizes where the launch overhead is a significant factor. For small sizes, typically smaller than 100x100, this function improves significantly performance compared to making calls to its corresponding cublas<t>gemm routine.

This is the link for more info: http://docs.nvidia.com/cuda/cublas/index.html#topic_8_2

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

请先登录，再进行评论。

Answer 4

Zainub 2015-2-9

0
链接

此回答的直接链接

https://ww2.mathworks.cn/matlabcentral/answers/60812-can-i-get-faster-matrixmultiplication-using-cuda-than-with-matlab-internal-gpu-implementation#answer_167408

hi... I am using the same thing (cuda with matlab)... but I face a problem while adding path of cublas library... Error using loadlibrary (line 422) There was an error loading the library "libcublas" The specified module could not be found.

Caused by: Error using loaddefinedlibrary The specified module could not be found.

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

请先登录，再进行评论。

Can I get faster MatrixMultiplication using CUDA than with Matlab internal GPU implementation??

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

回答（4 个）

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

另请参阅

类别

标签

产品

Community Treasure Hunt

Can I get faster MatrixMultiplication using CUDA than with Matlab internal GPU implementation??

0 个评论 显示 -2更早的评论隐藏 -2更早的评论

回答（4 个）

0 个评论 显示 -2更早的评论隐藏 -2更早的评论

0 个评论 显示 -2更早的评论隐藏 -2更早的评论

0 个评论 显示 -2更早的评论隐藏 -2更早的评论

0 个评论 显示 -2更早的评论隐藏 -2更早的评论

另请参阅

类别

标签

产品

Community Treasure Hunt

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论

0 个评论
显示 -2更早的评论隐藏 -2更早的评论