Commit 7722cd19 authored by He Guanlin
Browse files

Update gpu.cu

parent a5b9817b
......@@ -214,6 +214,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif
#if BSXN > 16
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 16)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 16];
else
......@@ -221,6 +222,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif
#if BSXN > 8
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 8)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 8];
else
......@@ -228,6 +230,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif
#if BSXN > 4
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 4)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 4];
else
......@@ -235,6 +238,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif
#if BSXN > 2
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 2)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 2];
else
......@@ -242,6 +246,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif
#if BSXN > 1
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 1)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 1];
else
......@@ -326,7 +331,7 @@ __global__ void UpdateCentroids_Step1_Parent(int *GPU_label, T_real *GPU_package
Db.y = BSYD;
Db.z = 1;
Dg.y = NbDims/BSYD + (NbDims%BSYD > 0 ? 1 : 0);
Dg.y = NbDims/Db.y + (NbDims%Db.y > 0 ? 1 : 0);
Dg.z = 1;
for (int i = 0; i < np; i++) {
......@@ -334,7 +339,7 @@ __global__ void UpdateCentroids_Step1_Parent(int *GPU_label, T_real *GPU_package
if (pid < NbPackages) {
offset = (pid < remainder ? ((quotient + 1) * pid) : (quotient * pid + remainder));
length = (pid < remainder ? (quotient + 1) : quotient);
Dg.x = length/BSXP + (length%BSXP > 0 ? 1 : 0);
Dg.x = length/Db.x + (length%Db.x > 0 ? 1 : 0);
// Launch a child kernel on a stream to process a package
UpdateCentroids_Step1_Child<<<Dg,Db,0,stream>>>(pid, offset, length, GPU_label, GPU_package, GPU_dataT, GPU_count);
}
......@@ -454,7 +459,7 @@ void gpuKmeans(void)
Tms_transpose += elapsed;
}
CHECK_CUDA_SUCCESS(cudaMemset(GPU_label, 0, sizeof(int)*NbPoints), "Reset GPU_label to zeros");
// CHECK_CUDA_SUCCESS(cudaMemset(GPU_label, 0, sizeof(int)*NbPoints), "Reset GPU_label to zeros");
do {
// Compute point-centroid distances & Assign each point to its nearest centroid
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment