I have a code that performs matrix multiplications, I decided implemeatar on gpu with kernel.cu kernel.ptx the matrix multiplications.
the kernel is generic for matrix multiplication of size (DIMX*DIMY)*(DIMY*DIMZ)
kernel:
__global__ void MatMultNoShared(float* A, float* B, float* C,
int ARows, int ACols, int BRows,
int BCols, int CRows, int CCols,
int TILE_DIM)
{
float CValue = 0;
//indice of threads
int Col = blockIdx.y*TILE_DIM + threadIdx.y;
int Row = blockIdx.x*TILE_DIM + threadIdx.x;
for(int k=0; k<(TILE_DIM + ACols-1)/TILE_DIM; k++)
{
for(int n=0;n<TILE_DIM; ++n)
{
if((k*TILE_DIM + n<ACols && Row < ARows) &&
(k*TILE_DIM + n < BRows && Col < BCols ))
CValue += A[Row*ACols + k*TILE_DIM + n]*B[(k*TILE_DIM + n)*BCols + Col];
}
}
if (Row < CRows && Col < CCols)
{
C[((blockIdx.y * blockDim.y + threadIdx.y)*CCols)+(blockIdx.x * blockDim.x) + threadIdx.x] = CValue;
}
}
result:
matrix | time | GFLOP
------------------------------------------------------
| gpuArray | kernel | gpuArray | kernel
1024*1024 | 0.009165 | 0.131942 | 234.195459 | 16.268014
2048*2048 | 0.049744 | 0.918414 | 345.279053 | 18.701454
3072*3072 | 0.160027 | 3.105348 | 362.267296 | 18.668641
4096*4096 | 0.375305 | 7.339200 | 366.161794 | 18.724408
this kernel(MatMultNoShared) is too slow.
kernel slow, why? I expected that the kernel was 10 times faster than the gpuArray
The second kernel is only to square matrix
#define TILE_DIM 16
__global__ void simpleMultiply(float *a, float* b, float *c, int N)
{
int ACols=N;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
for(int k=0; k<(TILE_DIM + ACols-1)/TILE_DIM; k++)
{
for (int i = 0; i < TILE_DIM; i++)
{
sum += a[row*TILE_DIM+i] * b[i*N+col];
}
}
c[row*N+col] = tanh(sum);
}
result:
matrix | time | GFLOP
------------------------------------------------------
| gpuArray | kernel | gpuArray | kernel
1024*1024 0.009609 0.006878 223.384168 312.054593
2048*2048 0.050874 0.039116 337.615311 439.097961
3072*2048 0.160703 0.123678 360.742749 468.736544
4096*2048 0.375426 0.290781 366.043310 472.596134
similar time, similar GFLOP kernel slow, why? I expected that the kernel was 10 times faster than the gpuArray
the last kernel, with shared memory
#define TILE_DIM 16
__global__ void simpleMultiply(float *a, float* b, float *c, int N)
{
int ACols=N;
__shared__ float aTile[TILE_DIM][TILE_DIM];
bTile[TILE_DIM][TILE_DIM+1];
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
aTile[threadIdx.y][threadIdx.x] = a[row*TILE_DIM+threadIdx.x];
bTile[threadIdx.y][threadIdx.x] = b[threadIdx.y*N+col];
for(int k=0; k<(TILE_DIM + ACols-1)/TILE_DIM; k++)
{
aTile[threadIdx.y][threadIdx.x] = a[row*TILE_DIM+threadIdx.x];
bTile[threadIdx.y][threadIdx.x] = b[threadIdx.y*N+col];
for (int i = 0; i < TILE_DIM; i++)
{
sum += aTile[threadIdx.y][i]* b[i*N+col];
}
}
c[row*N+col] = (sum);
}
result:
matrix | time | GFLOP
------------------------------------------------------
| gpuArray | kernel | gpuArray | kernel
1024*1024 0.008948 0.008453 239.872186 253.939580
2048*2048 0.051383 0.051315 334.269557 334.712226
3072*3072 0.162450 0.167174 356.863683 346.779796
4096*4096 0.375368 0.393502 366.099592 349.228523
I don't understand, why the last kernel is slower than the previous kernel?