I have been testing a CUDA matrix multiplication code for mex that I found online, and when I change the block size from 32 to 64, my output is suddenly all zeros.
My matrix multiplication is A X B = C. I have a matrix structure that holds the height and width of A, B, and C.
I compute the grid size as follows; the block size is a value I picked arbitrarily.
dim3 dimGrid(ceil((B->width + dimBlock.x - 1) / dimBlock.x), ceil((A.height + dimBlock.y - 1) / dimBlock.y));
I launch the CUDA kernel as follows:
MatMulKernel << <dimGrid, dimBlock >> >(A, B, C);
I tried two block sizes, (32, 32) and (64, 64). The output with a block size of 32 is correct, and the grid size/block size combination is
GRID SIZE = 48, 1
BLOCK SIZE = 32, 32
but the output when my block size is 64 is all zeros. In this case I have
GRID SIZE = 24, 1
BLOCK SIZE = 64, 64
In this example, I'm multiplying a 2x3 matrix (A) by a 3x1518 matrix (B). I'm not providing the matrix because it would just be a bunch of meaningless numbers to anyone but me.
Can anyone help me understand why this is happening? And if you need more information, let me know.
===========Edit===============
Here is the host function that sets up the device buffers and launches the kernel:
// Reports a failed CUDA runtime call through printf.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool matMulCudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    printf("MatMul: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host wrapper: computes C = A x B on the GPU.
//
// Copies A_h and B_h to device memory, launches MatMulKernel with a
// BLOCK_SIZE x BLOCK_SIZE thread block, and copies the product back into a
// buffer allocated with mxMalloc.
//
// Parameters:
//   A_h - host matrix A (width, height, elements are read).
//   B_h - host matrix B (width, height, elements are read).
//   C_h - output; width is set to B_h->width, height to A_h->height, and
//         elements to a freshly mxMalloc'ed buffer of width*height floats.
//
// Every CUDA call is checked. If any call fails (for example the launch is
// rejected because the block holds more threads than the device supports),
// the error is printed, C_h->elements is zero-filled so the caller never
// reads uninitialized memory, and all device buffers are still released.
void MatMul(Matrix * A_h, Matrix * B_h, Matrix* C_h){
    // Byte counts are computed in size_t; doing this arithmetic in float
    // silently loses precision for large matrices.
    const size_t elemSize = sizeof(float);
    const size_t countA = (size_t)A_h->width * (size_t)A_h->height;
    const size_t countB = (size_t)B_h->width * (size_t)B_h->height;
    const size_t countC = (size_t)B_h->width * (size_t)A_h->height;

    // Device-side descriptors. Element pointers start out NULL so the
    // cleanup at the end is valid no matter where a failure happens
    // (cudaFree(NULL) is a no-op).
    Matrix A_d;
    A_d.width = A_h->width;
    A_d.height = A_h->height;
    A_d.elements = NULL;

    Matrix B_d;
    B_d.width = B_h->width;
    B_d.height = B_h->height;
    B_d.elements = NULL;

    Matrix C_d;
    C_d.width = B_h->width;
    C_d.height = A_h->height;
    C_d.elements = NULL;

    // Host-side result buffer.
    C_h->width = C_d.width;
    C_h->height = C_d.height;
    C_h->elements = (float*)mxMalloc(countC * elemSize);

    // Allocate and upload the operands; stop at the first failure.
    bool ok =
        matMulCudaOk(cudaMalloc(&A_d.elements, countA * elemSize), "cudaMalloc(A)") &&
        matMulCudaOk(cudaMemcpy(A_d.elements, A_h->elements, countA * elemSize,
                                cudaMemcpyHostToDevice), "cudaMemcpy(A)") &&
        matMulCudaOk(cudaMalloc(&B_d.elements, countB * elemSize), "cudaMalloc(B)") &&
        matMulCudaOk(cudaMemcpy(B_d.elements, B_h->elements, countB * elemSize,
                                cudaMemcpyHostToDevice), "cudaMemcpy(B)") &&
        matMulCudaOk(cudaMalloc(&C_d.elements, countC * elemSize), "cudaMalloc(C)");

    // An empty result needs no launch (a zero-sized grid is itself an
    // invalid launch configuration).
    if (ok && countC > 0) {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        // Integer ceil-division: enough blocks to cover every column and row of C.
        dim3 dimGrid((C_d.width + dimBlock.x - 1) / dimBlock.x,
                     (C_d.height + dimBlock.y - 1) / dimBlock.y);

        MatMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d);

        // A kernel launch returns nothing; an invalid configuration is only
        // visible through cudaGetLastError(). The blocking copy that follows
        // surfaces any fault raised while the kernel was running.
        ok = matMulCudaOk(cudaGetLastError(), "MatMulKernel launch") &&
             matMulCudaOk(cudaMemcpy(C_h->elements, C_d.elements, countC * elemSize,
                                     cudaMemcpyDeviceToHost), "cudaMemcpy(C)");
    }

    if (!ok && C_h->elements != NULL) {
        // Give the caller a defined result instead of uninitialized memory.
        for (size_t i = 0; i < countC; ++i) {
            C_h->elements[i] = 0.0f;
        }
    }

    cudaFree(A_d.elements);
    cudaFree(C_d.elements);
    cudaFree(B_d.elements);
}