Commit 0940d9ce authored by Cabaret Laurent's avatar Cabaret Laurent
Browse files

Creation of 3 functions:

- standard
- prepared for autovect
- manual vect
parent 109806a0
......@@ -6,9 +6,9 @@ CUDA_TARGET_FLAGS = --gpu-architecture=sm_75
# For GTX 1080
# CUDA_TARGET_FLAGS = --gpu-architecture=sm_61
CXXFLAGS = -O3 #-DDP
CXXFLAGS = -O3 #-DDP
CXXFLAGS += -I/usr/local/cuda/include/
CC_CXXFLAGS = -fopenmp -funroll-loops
CC_CXXFLAGS = -fopenmp -funroll-loops -march=native
CUDA_CXXFLAGS = $(CUDA_TARGET_FLAGS)
CC_LDFLAGS = -fopenmp
......
......@@ -20,7 +20,7 @@ cluster_dim3 = [60, 60, 40, 40]
cluster_dim4 = [60, 40, 60, 40]
# initialize the nb of points within each cluster and sum up the total nb of points
points_per_cluster = 2500000 # 12500000
points_per_cluster = 12500000 # 12500000
nb_points = [points_per_cluster, points_per_cluster, points_per_cluster, points_per_cluster]
sum_np = sum(nb_points)
......
......@@ -2,11 +2,85 @@
#include <stdlib.h>
#include <omp.h>
#include <float.h>
#include <immintrin.h>
#include "main.h"
#include "init.h"
#include "gpu.h"
/*-----------------------------------------------------------------------------------------*/
/* no_vectorisation candidate */
/*-----------------------------------------------------------------------------------------*/
/*
 * Scalar reference version: adds to dist_sq the squared difference between
 * instance i and centroid k over every one of the NbDims dimensions, visiting
 * the dimensions one at a time in increasing order.
 *
 *   i        index of the instance; its coordinates are read from
 *            data[i*NbDims] .. data[i*NbDims + NbDims - 1]
 *   k        index of the centroid; its coordinates are read from centroid[k][*]
 *   dist_sq  starting value of the accumulator (the caller passes 0.0)
 *
 * Returns the accumulator after all dimensions have been added.
 */
T_real square_of_distance_between_instance_i_and_centroid_k_no_vect(int i, int k, T_real dist_sq)
{
    // Offset of the first coordinate of instance i inside the flat data array.
    const int row = i*NbDims;

    int dim = 0;
    while (dim < (NbDims)) {
        const T_real diff = data[row + dim] - centroid[k][dim];
        dist_sq += diff*diff;
        ++dim;
    }
    return dist_sq;
}
/*-----------------------------------------------------------------------------------------*/
/* Autovectorisation candidate */
/*-----------------------------------------------------------------------------------------*/
/*
 * Version laid out to help the compiler auto-vectorise: adds to dist_sq the
 * squared difference between instance i and centroid k over all NbDims
 * dimensions, processed in three stages:
 *   1. groups of 8 dimensions,
 *   2. one group of 4 dimensions if at least 4 remain after stage 1,
 *   3. the last NbDims%4 dimensions (at most 3).
 * Each stage first fills a small fixed-size array of squared differences and
 * only then reduces it into dist_sq, so the inner loops have a constant trip
 * count and no loop-carried dependency.
 *
 *   i        index of the instance (coordinates at data[i*NbDims + ...])
 *   k        index of the centroid (coordinates at centroid[k][...])
 *   dist_sq  starting value of the accumulator
 *
 * Returns the accumulator after all dimensions have been added.
 */
T_real square_of_distance_between_instance_i_and_centroid_k_auto_vect(int i, int k, T_real dist_sq)
{
    const int row  = i*NbDims;       // offset of instance i in the flat data array
    const int end8 = (NbDims/8)*8;   // first dimension not covered by the 8-wide stage
    const int end4 = (NbDims/4)*4;   // first dimension not covered by the 4-wide stage
    const int tail = NbDims%4;       // number of dimensions left for the last stage

    // Stage 1: 8 dimensions per iteration.
    for (int base = 0; base < end8; base += 8) {
        T_real sq[8] = {0.0};
        for (int lane = 0; lane < 8; lane++) {
            const T_real diff = data[row + base + lane] - centroid[k][base + lane];
            sq[lane] = diff*diff;
        }
        // The eight partial results are summed left to right before being added.
        dist_sq += (sq[0] + sq[1] + sq[2] + sq[3] + sq[4] + sq[5] + sq[6] + sq[7]);
    }

    // Stage 2: 4 dimensions per iteration.
    for (int base = end8; base < end4; base += 4) {
        T_real sq[4] = {0.0};
        for (int lane = 0; lane < 4; lane++) {
            const T_real diff = data[row + base + lane] - centroid[k][base + lane];
            sq[lane] = diff*diff;
        }
        // Here each partial result is added to the accumulator one by one.
        dist_sq += sq[0];
        dist_sq += sq[1];
        dist_sq += sq[2];
        dist_sq += sq[3];
    }

    // Stage 3: remaining 1 to 3 dimensions; unused slots stay at 0.0.
    if (tail > 0) {
        T_real sq[3] = {0.0};
        for (int lane = 0; lane < tail; lane++) {
            const T_real diff = data[row + end4 + lane] - centroid[k][end4 + lane];
            sq[lane] = diff*diff;
        }
        dist_sq += (sq[0] + sq[1] + sq[2]);
    }

    return dist_sq;
}
/*-----------------------------------------------------------------------------------------*/
/* Manual vectorisation candidate (AVX intrinsics) */
/*-----------------------------------------------------------------------------------------*/
/*
 * Hand-vectorised version using 256-bit AVX intrinsics: adds to dist_sq the
 * squared difference between instance i and centroid k over all NbDims
 * dimensions. Dimensions are processed 8 at a time with unaligned loads; any
 * dimensions left over (NbDims not a multiple of 8) are handled by a scalar
 * loop.
 *
 *   i        index of the instance (coordinates at data[i*NbDims + ...])
 *   k        index of the centroid (coordinates at centroid[k][...])
 *   dist_sq  starting value of the accumulator
 *
 * Returns the accumulator after all dimensions have been added.
 *
 * NOTE(review): the _ps intrinsics operate on single-precision values, so this
 * function assumes T_real is float - confirm before building with a
 * double-precision T_real.
 */
T_real square_of_distance_between_instance_i_and_centroid_k_manual_vect(int i, int k, T_real dist_sq)
{
    // Eight independent partial sums, one per AVX lane.
    __m256 vect_dist_sq = _mm256_setzero_ps();

    for (int j = 0; j < (NbDims/8)*8; j += 8) {
        __m256 vect_data     = _mm256_loadu_ps(&data[i*NbDims + j]);
        __m256 vect_centroid = _mm256_loadu_ps(&centroid[k][j]);
        // Per-dimension difference: this must be a subtraction so that the
        // lanes hold (data - centroid), matching the scalar tail loop below.
        __m256 vect_diff = _mm256_sub_ps(vect_data, vect_centroid);
        // Square the differences and accumulate them lane by lane.
        vect_dist_sq = _mm256_add_ps(vect_dist_sq, _mm256_mul_ps(vect_diff, vect_diff));
    }

    // Horizontal reduction: spill the eight lanes and add them to the accumulator.
    float ddist_sq[8];
    _mm256_storeu_ps(ddist_sq, vect_dist_sq);
    dist_sq += (ddist_sq[0] + ddist_sq[1] + ddist_sq[2] + ddist_sq[3] + ddist_sq[4] + ddist_sq[5] + ddist_sq[6] + ddist_sq[7]);

    // Finish the job: scalar loop over the dimensions not covered by the
    // 8-wide loop.
    for (int j = (NbDims/8)*8; j < NbDims; j++) {
        const T_real diff = data[i*NbDims + j] - centroid[k][j];
        dist_sq += diff*diff;
    }
    return dist_sq;
}
/*-----------------------------------------------------------------------------------------*/
/* K-means clustering on the CPU */
......@@ -80,30 +154,9 @@ void cpuKmeans(void)
for (int k = 0; k < NbClusters; k++) {
dist_sq = 0.0;
// Calculate the square of distance between instance i and centroid k
for (int j = 0; j < (NbDims/8)*8; j += 8) {
T_real ddist_sq[8] = {0.0};
for (int jj = 0; jj < 8; jj++) {
ddist_sq[jj] = (data[i*NbDims + j + jj] - centroid[k][j + jj])*(data[i*NbDims + j + jj] - centroid[k][j + jj]);
}
dist_sq += (ddist_sq[0] + ddist_sq[1] + ddist_sq[2] + ddist_sq[3] + ddist_sq[4] + ddist_sq[5] + ddist_sq[6] + ddist_sq[7]);
}
for (int j = (NbDims/8)*8; j < (NbDims/4)*4; j += 4) {
T_real ddist_sq[4] = {0.0};
for (int jj = 0; jj < 4; jj++) {
ddist_sq[jj] = (data[i*NbDims + j + jj] - centroid[k][j + jj])*(data[i*NbDims + j + jj] - centroid[k][j + jj]);
}
for (int jj = 0; jj < 4; jj++)
dist_sq += ddist_sq[jj];
}
if (NbDims%4 > 0) {
T_real ddist_sq[3] = {0.0};
for (int jj = 0; jj < NbDims%4; jj++) {
ddist_sq[jj] = (data[i*NbDims + (NbDims/4)*4 + jj] - centroid[k][(NbDims/4)*4 + jj])*(data[i*NbDims + (NbDims/4)*4 + jj] - centroid[k][(NbDims/4)*4 + jj]);
}
dist_sq += (ddist_sq[0] + ddist_sq[1] + ddist_sq[2]);
}
//dist_sq = square_of_distance_between_instance_i_and_centroid_k_no_vect(i, k, dist_sq);
//dist_sq = square_of_distance_between_instance_i_and_centroid_k_auto_vect(i, k, dist_sq);
dist_sq = square_of_distance_between_instance_i_and_centroid_k_manual_vect(i, k, dist_sq);
// Find and record the nearest centroid to instance i
bool a = (dist_sq < minDist_sq);
......
......@@ -2,7 +2,7 @@
/* Define constants */
/*-----------------------------------------------------------------------------------------*/
// Benchmark dataset
#define NbPoints 1000000 // Number of data instances (50000000)
#define NbPoints 50000000 // Number of data instances (50000000)
#define NbDims 4 // Number of dimensions
#define NbClusters 4 // Number of clusters
#define NbPackages 100 // Number of packages used for UpdateCentroids
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment