Why the output of my MEX Function become incorrect when used in a for loop in Matlab? Memory Management!!

3 次查看(过去 30 天)
Hi,
I'm trying to write a MEXGateway code so i could pass some variables in Matlab to my MEXFunction and process in GPU. The problem is that the MEXFunction works fine for the first time, but not anymore since its output is not correct. Based on what i found in the web, it could be a memory management issue. However, i cannot find any problem in my code causing this (I'm using proper commands at the end of my code to free all the memories (either in host or device) allocated in my code). Here is my code (and attached you can find all the codes you need to compile):
#include "Header.cuh"
#define NRHS 10 // the number of inputs
#define NLHS 2 // the number of outputs
__global__ void Test(int* Device_MediumZ) {
Device_MediumZ[threadIdx.x] += 1;
//printf("Device_MediumZ: %d ", Device_MediumZ[threadIdx.x]);
}
void mexFunction(int nlhs, mxArray* plhs[],
int nrhs, const mxArray* prhs[]) {
//mxInitGPU();
/* check for proper number of arguments */
if (nrhs != NRHS) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nrhs", "Number of inputs must be %u", NRHS);
}
if (nlhs != NLHS) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nlhs", "Number of outputs must be %u", NLHS);
}
int Counter = 0;
double TransferDataToDevice = mxGetScalar(prhs[Counter]); Counter++;; // we can set this variable "0" once the informative data are transfered.
Setup SystemSetup; // the variable containing the properties of the medium.
fillSetup(SystemSetup, prhs[Counter], 1, TransferDataToDevice); Counter++; // set the last parameter zero if you do not want to show the properties of the setup.
//float* MediumX; // X
float* MediumZ; // Z coordinates of the pixels
int* LensZIndex; // Z index of the lens; it has been already substracted by -1 in MATLAB
int* RfData; // RF data; a pinned memory was dedicated to this
int* Dir; // directivity pattern of the elements; a pinned memory was dedicated to this
float* LensArrivalTime; // The arrival time of the lens index with respect to the array elements; a pinned memory was dedicated to this
float* TRansducerCorrX; // X
float* TRansducerCorrZ; // Z coordinates of the array elements
int* ReconstructedImage_GPU_Sorted;
float* ProcessingTime; // this pointer contans the processing time of this mex file
mxGPUArray const* MediumX = mxGPUCreateFromMxArray(prhs[Counter]); Counter++;// Can be CPU or GPU, will copy to GPU if its not already there
float* Device_MediumX = static_cast<float*>((float*)mxGPUGetDataReadOnly(MediumX)); // get the pointer itself (assuming float data)
MediumZ = (float*)mxGetPr(prhs[Counter]); Counter++;
LensZIndex = (int*)mxGetPr(prhs[Counter]); Counter++;
RfData = (int*)mxGetPr(prhs[Counter]); Counter++;
Dir = (int*)mxGetPr(prhs[Counter]); Counter++;
LensArrivalTime = (float*)mxGetPr(prhs[Counter]); Counter++;
TRansducerCorrX = (float*)mxGetPr(prhs[Counter]); Counter++;
TRansducerCorrZ = (float*)mxGetPr(prhs[Counter]); Counter++;
plhs[0] = mxCreateNumericMatrix(SystemSetup.Nz, SystemSetup.Nx, mxINT32_CLASS, mxREAL);
//plhs[0] = mxCreateDoubleMatrix(1, SystemSetup.Nz * SystemSetup.Nx, mxREAL);
ReconstructedImage_GPU_Sorted = (int*)mxGetData(plhs[0]);
plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
ProcessingTime = (float*)mxGetPr(plhs[1]);
int NStart_Transmit{};
bool ShowTime = 1;
clock_t CUDA_start, CUDA_end, MAX_Find_Start, MAX_Find_End; // pointers to calculate the processing time at different stages
int ArrayByteSize_RfData = sizeof(int) * (SystemSetup.NumberOfTransmitter * SystemSetup.NumberOfReceiver * SystemSetup.NumberOfSamples);
int Dir_Size = SystemSetup.Nz * SystemSetup.Nx; // keep this constant. to refere to each pixel seen by each element, we need to shift by Dir_Size
int ArrayByteSize_Dir = sizeof(int) * (SystemSetup.NumberOfReceiver * Dir_Size);
float ArrayByteSize_LensArrivalTime = sizeof(float) * (SystemSetup.NumberOfReceiver * SystemSetup.Nx);
////>>>>>>>>>>>>>>>>> POINTERS for the DEVICE >>>>>>>>>>>>>>>>>>>>>>
Setup* Device_SystemSetup; // Devide pointer to the medium properties
float ArrayByteSize_MediumZ = sizeof(float) * SystemSetup.Nz;
float* Device_MediumZ; // device pointer to the Z coordinates of the medium
float ArrayByteSize_MediumX = sizeof(float) * SystemSetup.Nx;
//float* Device_MediumX; // device pointer to the X coordinates of the medium
float ArrayByteSize_TRansducerCorrZ = sizeof(float) * SystemSetup.NumberOfReceiver;
float* Device_TRansducerCorrZ; // device pointer to the Z coordinates of the array elements
float ArrayByteSize_TRansducerCorrX = sizeof(float) * SystemSetup.NumberOfReceiver;
float* Device_TRansducerCorrX; // device pointer to the X coordinates of the array elements
int ArrayByteSize_LensZIndex = sizeof(int) * SystemSetup.Nx;
int* Device_LensZIndex; // device pointer to the Z coordinates of the lens.
float* Device_LensArrivalTime; // device pointer to the lens arrival time.
int BYTES_PER_STREAM = ArrayByteSize_RfData / SystemSetup.NumberOfTransmitter;
int* Device_Dir;// device pointer to the directivity of the array elements.
int ArrayByteSize_ReconstructedImage_GPU = sizeof(int) * (SystemSetup.Nz * SystemSetup.Nx);
int* Device_ReconstructedImage_GPU; // device pointer to the reconstructed image, I part
int* Device_ReconstructedImage_GPU_Q; // device pointer to the reconstructed image, Q part
int Size_Proximal = (SystemSetup.Perio_ZEnd - SystemSetup.LenseThicknessIndex) * SystemSetup.Nx;
float ArrayByteSize_ProximalArrivalTime = sizeof(float) * (SystemSetup.NumberOfReceiver * Size_Proximal);
float* Device_ProximalArrivalTime; // Device pointer to the arrival time of the pixels we need to reconstruct to find the proximal end
int ArrayByteSize_ReconstructedImage_GPU_ProximalSegmentation = sizeof(int) * ((SystemSetup.Perio_ZEnd - SystemSetup.LenseThicknessIndex) * SystemSetup.Nx);
float* Device_DistalArrivalTime;// Device pointer to the arrival time of the pixels we need to reconstruct to find the distal end
float* Device_RestArrivalTime; // Device pointer to the arrival time of the pixels we need to reconstruct after finding the distal end
int* Device_ZIndexProximal; // Device pointer to the segmented proximal end
int* Device_ZIndexDistal; // Device pointer to the segmented distal end
int GridX, size, ReconstructionSoundSpeed, Size_Distal;
int NumberOfPixels{};
/// pointers needed for parabola fitting:
int Segmentation_ZStartPixel;
int Segmentation_ZEndPixel;
int Nz_Seg;
float ArrayByteSize_DistalArrivalTime;
float ArrayByteSize_RestArrivalTime;
//here is where we transfer the data to the GPU, ONLY for ONCE ////////////////////////////////////////////////////////////////////////////////////
int* xParabola;
int* FittingOutput;
xParabola = new int[SystemSetup.Nx];
for (int i = 0; i < SystemSetup.Nx; i++) {
xParabola[i] = i;
}
FittingOutput = new int[SystemSetup.Nx];
int* ZIndexProximal;
int* ZIndexDistal;
int* ReconstructedImage_GPU;
int* ReconstructedImage_GPU_Q;
int* ReconstructedImage_GPU_ABS;
ZIndexProximal = (int*)calloc(SystemSetup.Nx, sizeof(int)); // pointer for the proximal end
ZIndexDistal = (int*)calloc(SystemSetup.Nx, sizeof(int)); // pointer for the distal end
ReconstructedImage_GPU = (int*)calloc(SystemSetup.Nz * SystemSetup.Nx, sizeof(int)); // the pointer for the reconstructed image, I
ReconstructedImage_GPU_Q = (int*)calloc(SystemSetup.Nz * SystemSetup.Nx, sizeof(int)); // the pointer for the reconstructed image, Q
ReconstructedImage_GPU_ABS = (int*)calloc(SystemSetup.Nz * SystemSetup.Nx, sizeof(int)); // the pointer for the reconstructed image, IQ demodulated
//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> prepare the data to run Kernels in the device <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
// Memory allocation and transfer to the Device: SystemSetup this should be only transfered ONCE
(cudaMalloc((void**)&Device_SystemSetup, sizeof(Setup)));
(cudaMemcpy(Device_SystemSetup, &SystemSetup, sizeof(Setup), cudaMemcpyHostToDevice));
// Memory allocation and transfer to the Device: MediumZ this should be only transfered ONCE
(cudaMalloc((float**)&Device_MediumZ, ArrayByteSize_MediumZ));
(cudaMemcpy(Device_MediumZ, MediumZ, ArrayByteSize_MediumZ, cudaMemcpyHostToDevice));
(cudaMalloc((float**)&Device_TRansducerCorrZ, ArrayByteSize_TRansducerCorrZ));
(cudaMemcpy(Device_TRansducerCorrZ, TRansducerCorrZ, ArrayByteSize_TRansducerCorrZ, cudaMemcpyHostToDevice));
// Memory allocation and transfer to the Device : TRansducerCorrX this should be only transfered ONCE
(cudaMalloc((float**)&Device_TRansducerCorrX, ArrayByteSize_TRansducerCorrX));
(cudaMemcpy(Device_TRansducerCorrX, TRansducerCorrX, ArrayByteSize_TRansducerCorrX, cudaMemcpyHostToDevice));
//Memory allocation and transfer to the Device : LensZIndex this should be only transfered ONCE
(cudaMalloc((int**)&Device_LensZIndex, ArrayByteSize_LensZIndex));
(cudaMemcpy(Device_LensZIndex, LensZIndex, ArrayByteSize_LensZIndex, cudaMemcpyHostToDevice));
// Memory allocation and transfer to the Device : LensArrivalTime this should be only transfered ONCE
(cudaMalloc((float**)&Device_LensArrivalTime, ArrayByteSize_LensArrivalTime));
(cudaMemcpy(Device_LensArrivalTime, LensArrivalTime, ArrayByteSize_LensArrivalTime, cudaMemcpyHostToDevice));
(cudaMalloc((int**)&Device_Dir, ArrayByteSize_Dir));
(cudaMemcpy(Device_Dir, Dir, ArrayByteSize_Dir, cudaMemcpyHostToDevice));
(cudaMalloc((int**)&Device_ReconstructedImage_GPU, ArrayByteSize_ReconstructedImage_GPU));
(cudaMalloc((int**)&Device_ReconstructedImage_GPU_Q, ArrayByteSize_ReconstructedImage_GPU));
(cudaMalloc((int**)&Device_ZIndexProximal, ArrayByteSize_LensZIndex));
(cudaMalloc((int**)&Device_ZIndexDistal, ArrayByteSize_LensZIndex));
(cudaMalloc((float**)&Device_ProximalArrivalTime, ArrayByteSize_ProximalArrivalTime));
if (ShowTime == 1) { CUDA_start = clock(); }
//}while (TransferDataToDevice == 1);
int* Device_RfData; // device pointer to the RF data.
(cudaMalloc((int**)&Device_RfData, ArrayByteSize_RfData));
(cudaHostRegister(RfData, ArrayByteSize_RfData, cudaHostRegisterPortable));
//printf("The CUDA reconstruction started... \n");
dim3 block(1024, 1);
// >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Find the type of image recnstruction: conventional/advanced >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
if (SystemSetup.ProcessingType == 1) {
NumberOfPixels = (SystemSetup.Nz - 1) * SystemSetup.Nx;
GridX = (NumberOfPixels / block.x);
ReconstructionSoundSpeed = SystemSetup.WaterSoundSpeed;
}
else if (SystemSetup.ProcessingType == 2) {
NumberOfPixels = SystemSetup.LenseThicknessIndex * SystemSetup.Nx;
GridX = (NumberOfPixels / block.x + (((NumberOfPixels % block.x) != 0) * 1));
ReconstructionSoundSpeed = SystemSetup.LenseSoundSpeed;
}
else {
cout << "ERROR in the Griding: The processing type does not exist..." << endl;
}
dim3 grid(GridX, SystemSetup.NumberOfTransmitter);//SystemSetup.NumberOfTransmitter
cudaStream_t* streams = new cudaStream_t[SystemSetup.NumberOfTransmitter]; //SystemSetup.NumberOfTransmitter
for (int transmit = 0; transmit < SystemSetup.NumberOfTransmitter; transmit++) {
cudaStreamCreate(&streams[transmit]);
NStart_Transmit = transmit * (SystemSetup.NumberOfReceiver * SystemSetup.NumberOfSamples);
cudaMemcpyAsync(&Device_RfData[NStart_Transmit], &RfData[NStart_Transmit], BYTES_PER_STREAM, cudaMemcpyHostToDevice, streams[transmit]);
kernel_Reconstruction2 << <grid, block, 0, streams[transmit] >> > (Device_SystemSetup, Device_MediumZ, Device_MediumX, Device_TRansducerCorrZ, Device_TRansducerCorrX,
&Device_RfData[NStart_Transmit], Device_Dir, Dir_Size, ReconstructionSoundSpeed, Device_ReconstructedImage_GPU, Device_ReconstructedImage_GPU_Q,
transmit, NStart_Transmit, NumberOfPixels);
(cudaPeekAtLastError());
}
for (int transmit = 0; transmit < SystemSetup.NumberOfTransmitter; transmit++) { cudaStreamDestroy(streams[transmit]); } // destroy the streams
delete[] streams;
cudaDeviceSynchronize();
(cudaMemcpy(ReconstructedImage_GPU, Device_ReconstructedImage_GPU, ArrayByteSize_ReconstructedImage_GPU, cudaMemcpyDeviceToHost));
(cudaMemcpy(ReconstructedImage_GPU_Q, Device_ReconstructedImage_GPU_Q, ArrayByteSize_ReconstructedImage_GPU, cudaMemcpyDeviceToHost));
for (int IX = 0; IX < SystemSetup.Nz * SystemSetup.Nx; IX++) {
ReconstructedImage_GPU_ABS[IX] = abs(complex<int>(ReconstructedImage_GPU[IX], ReconstructedImage_GPU_Q[IX]));
}
for (int Pz = 0; Pz < SystemSetup.Nz; Pz++) {
for (int Px = 0; Px < SystemSetup.Nx; Px++) {
ReconstructedImage_GPU_Sorted[Px * SystemSetup.Nz + Pz] = ReconstructedImage_GPU_ABS[Pz * SystemSetup.Nx + Px];
}
//fout30 << endl;
}
if (ShowTime == 1) {
CUDA_end = clock();
*ProcessingTime = (double)((double)(CUDA_end - CUDA_start) / CLOCKS_PER_SEC);
}
// delete the memories defined in the program
(cudaFree(Device_SystemSetup));
(cudaFree(Device_MediumZ));
//gpuErrchk(cudaFree(Device_MediumX));
mxGPUDestroyGPUArray(MediumX);
(cudaFree(Device_TRansducerCorrZ));
(cudaFree(Device_TRansducerCorrX));
(cudaFree(Device_LensZIndex));
(cudaFree(Device_LensArrivalTime));
(cudaFree(Device_RfData));
(cudaFree(Device_Dir));
(cudaFree(Device_ReconstructedImage_GPU));
(cudaFree(Device_ReconstructedImage_GPU_Q));
(cudaFree(Device_ProximalArrivalTime));
(cudaFree(Device_ZIndexProximal));
(cudaFree(Device_ZIndexDistal));
delete[] xParabola;
delete[]FittingOutput;
free(ZIndexProximal);
free(ZIndexDistal);
free(ReconstructedImage_GPU);
free(ReconstructedImage_GPU_Q);
free(ReconstructedImage_GPU_ABS);
//cudaDeviceReset();
}
So, As you can see, i use "cudaFree", "delete" and "free" at the end of my code for memory management. Could you please let me know what could be wrong in my code?
Moein.
  3 个评论
Moein Mozaffarzadeh
Hi Richard,
Thank you for your comment.
I added cudaHostUnregister(RfData); at the end of my code, but the problem still exists. So, there should be something else going on. I also compile all these codes with Matlab following your suggestion.
Any other idea?
Moein.
Moein Mozaffarzadeh
Hi Richard,
If it helps, pleae use https://filesender.surf.nl/?s=download&token=6b1c494b-56d4-4f48-9336-0f086f5dbb2b to download the data you need for running the compiled code in Matlab.
Then, please use in Matlab to see the problem.
clear TUI_Main
GPU_device=gpuDevice();
DR=40;
for i=1:10000
i
TransferDataToDevice=i;
[Image,ProcessingTime]=TUI_Main(TransferDataToDevice,Setup,MediumX,MediumZ,Lens_Index_Array,...
data_Rearranged_saved,Dir_Rearranged_saved,LensToElementsArrivalTime,TransElmXPos,TransElmZPos);
PT(i)=ProcessingTime;
abs1= double(abs(Image));
a = abs1/max(abs1(:));
a1 = 20*log10(a);
a2= (a1+DR);
a3=a2.*(a2>0);
figure(1) ;
imagesc(MediumX,MediumZ,a3); colormap gray; pause(0.1);
%%reset(GPU_device);
end
While the first image (i=1) is as i expect, unwanted effects are added to the image each as i increases.
I hope this helps with better problem clarification.
Moein.

请先登录,再进行评论。

回答(0 个)

类别

Help CenterFile Exchange 中查找有关 GPU Computing 的更多信息

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by