Commit 7722cd19 authored by He Guanlin
Browse files

Update gpu.cu

parent a5b9817b
...@@ -214,6 +214,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_ ...@@ -214,6 +214,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif #endif
#if BSXN > 16 #if BSXN > 16
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 16) if (threadIdx.x < 16)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 16]; shTrack[threadIdx.x] += shTrack[threadIdx.x + 16];
else else
...@@ -221,6 +222,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_ ...@@ -221,6 +222,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif #endif
#if BSXN > 8 #if BSXN > 8
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 8) if (threadIdx.x < 8)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 8]; shTrack[threadIdx.x] += shTrack[threadIdx.x + 8];
else else
...@@ -228,6 +230,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_ ...@@ -228,6 +230,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif #endif
#if BSXN > 4 #if BSXN > 4
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 4) if (threadIdx.x < 4)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 4]; shTrack[threadIdx.x] += shTrack[threadIdx.x + 4];
else else
...@@ -235,6 +238,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_ ...@@ -235,6 +238,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif #endif
#if BSXN > 2 #if BSXN > 2
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 2) if (threadIdx.x < 2)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 2]; shTrack[threadIdx.x] += shTrack[threadIdx.x + 2];
else else
...@@ -242,6 +246,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_ ...@@ -242,6 +246,7 @@ __global__ void ComputeAssign(T_real *GPU_dataT, T_real *GPU_centroid, int *GPU_
#endif #endif
#if BSXN > 1 #if BSXN > 1
__syncwarp(); // avoid races between threads within the same warp
if (threadIdx.x < 1) if (threadIdx.x < 1)
shTrack[threadIdx.x] += shTrack[threadIdx.x + 1]; shTrack[threadIdx.x] += shTrack[threadIdx.x + 1];
else else
...@@ -326,7 +331,7 @@ __global__ void UpdateCentroids_Step1_Parent(int *GPU_label, T_real *GPU_package ...@@ -326,7 +331,7 @@ __global__ void UpdateCentroids_Step1_Parent(int *GPU_label, T_real *GPU_package
Db.y = BSYD; Db.y = BSYD;
Db.z = 1; Db.z = 1;
Dg.y = NbDims/BSYD + (NbDims%BSYD > 0 ? 1 : 0); Dg.y = NbDims/Db.y + (NbDims%Db.y > 0 ? 1 : 0);
Dg.z = 1; Dg.z = 1;
for (int i = 0; i < np; i++) { for (int i = 0; i < np; i++) {
...@@ -334,7 +339,7 @@ __global__ void UpdateCentroids_Step1_Parent(int *GPU_label, T_real *GPU_package ...@@ -334,7 +339,7 @@ __global__ void UpdateCentroids_Step1_Parent(int *GPU_label, T_real *GPU_package
if (pid < NbPackages) { if (pid < NbPackages) {
offset = (pid < remainder ? ((quotient + 1) * pid) : (quotient * pid + remainder)); offset = (pid < remainder ? ((quotient + 1) * pid) : (quotient * pid + remainder));
length = (pid < remainder ? (quotient + 1) : quotient); length = (pid < remainder ? (quotient + 1) : quotient);
Dg.x = length/BSXP + (length%BSXP > 0 ? 1 : 0); Dg.x = length/Db.x + (length%Db.x > 0 ? 1 : 0);
// Launch a child kernel on a stream to process a package // Launch a child kernel on a stream to process a package
UpdateCentroids_Step1_Child<<<Dg,Db,0,stream>>>(pid, offset, length, GPU_label, GPU_package, GPU_dataT, GPU_count); UpdateCentroids_Step1_Child<<<Dg,Db,0,stream>>>(pid, offset, length, GPU_label, GPU_package, GPU_dataT, GPU_count);
} }
...@@ -454,7 +459,7 @@ void gpuKmeans(void) ...@@ -454,7 +459,7 @@ void gpuKmeans(void)
Tms_transpose += elapsed; Tms_transpose += elapsed;
} }
CHECK_CUDA_SUCCESS(cudaMemset(GPU_label, 0, sizeof(int)*NbPoints), "Reset GPU_label to zeros"); // CHECK_CUDA_SUCCESS(cudaMemset(GPU_label, 0, sizeof(int)*NbPoints), "Reset GPU_label to zeros");
do { do {
// Compute point-centroid distances & Assign each point to its nearest centroid // Compute point-centroid distances & Assign each point to its nearest centroid
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment