Size: 512 x 512 x 512
Threads | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Thread |
---|---|---|---|
1 | 267.075484 | 0.653310 | 1. |
2 | 135.902281 | 1.283886 | 1.96520182 |
4 | 70.080111 | 2.489766 | 3.81100243 |
8 | 36.813651 | 4.739629 | 7.25479328 |
16 | 25.793147 | 6.764706 | 10.35451164 |
28 | 17.400062 | 10.027726 | 15.34910839 |
Threads | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Thread | Speedup / Naive Single Thread |
---|---|---|---|---|
1 | 33.793919 | 5.163149 | 1. | 7.90306133 |
2 | 20.366889 | 8.566996 | 1.65925795 | 13.11321731 |
4 | 11.615848 | 15.021120 | 2.90929431 | 22.99233136 |
8 | 10.494952 | 16.625426 | 3.22001670 | 25.44798947 |
16 | 8.618103 | 20.246109 | 3.92127150 | 30.99004913 |
28 | 7.622385 | 22.890871 | 4.43350967 | 35.03829882 |
The optimization combines Time Skewing with manual vectorization using intrinsics. Time Skewing tiles the spatial dimensions and skews the tiles along the time dimension, so that the computation at step t + 1 can reuse the results of step t while they are still resident in cache, which improves performance. Note that the extent of a tile changes as t grows (tiles touching the domain boundary shrink or grow, while interior tiles shift), so tile sizes are not uniform. Fortunately, Time Skewing does not parallelize across tiles; the parallelism lives inside each tile, so this does not cause serious load imbalance.
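
As a minimal illustration of how these skewed tile bounds evolve, here is a hypothetical 1D example (the sizes are made up; only the slope arithmetic matches the kernel below):

```c
/* 1D sketch of the skewed tile bounds used in stencil_7 below.
 * start/end/tile/nt are made-up values; the slope logic mirrors the kernel. */
#include <stdio.h>

int main(void) {
  const int start = 1, end = 13, tile = 4, nt = 3;
  for (int xx = start; xx < end; xx += tile) {
    /* A face on the domain boundary keeps slope 0; interior faces skew. */
    const int neg = (xx == start) ? 0 : 1;
    const int pos = (xx + tile < end) ? -1 : 0;
    for (int t = 0; t < nt; ++t) {
      int lo = xx - t * neg;        /* lower bound shifts left over time */
      int hi = xx + tile + t * pos; /* upper bound shrinks for interior faces */
      if (lo < start) lo = start;
      if (hi > end) hi = end;
      printf("tile@%2d t=%d -> [%2d, %2d)\n", xx, t, lo, hi);
    }
  }
  return 0;
}
```

Running it shows the first tile shrinking, interior tiles shifting toward lower x, and the last tile growing, which is exactly why the tile sizes end up non-uniform.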
Manual vectorization with Intel intrinsics turns out to bring little benefit over the compiler's auto-vectorization, but it is marginally better than nothing.

Also, because the computation order differs, the results may deviate slightly from those of the serial version; the deviation is within an acceptable tolerance, and the algorithm itself is correct.
The remaining algorithms, such as 2D Cache Blocking, Cache Oblivious, and Circular Queue, were also implemented and tested following StencilProbe; none is as efficient as Time Skewing. The results are listed under Performance.
```c
#define INTRINSIC

#include <immintrin.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#include "common.h"

const char *version_name = "OMP";

void create_dist_grid(dist_grid_info_t *grid_info) {
  if (grid_info->p_id == 0) {
    grid_info->local_size_x = grid_info->global_size_x;
    grid_info->local_size_y = grid_info->global_size_y;
    grid_info->local_size_z = grid_info->global_size_z;
  } else {
    grid_info->local_size_x = 0;
    grid_info->local_size_y = 0;
    grid_info->local_size_z = 0;
  }
  grid_info->offset_x = 0;
  grid_info->offset_y = 0;
  grid_info->offset_z = 0;
  grid_info->halo_size_x = 1;
  grid_info->halo_size_y = 1;
  grid_info->halo_size_z = 1;
}

void destroy_dist_grid(dist_grid_info_t *grid_info) {}

int Min(const int a, const int b) { return (a < b) ? a : b; }
int Max(const int a, const int b) { return (a < b) ? b : a; }

#ifdef INTRINSIC
/* Compute the 7-point stencil for 4 consecutive x positions with AVX FMA. */
void Kernel7(cptr_t a0, ptr_t a1, const int x, const int y, const int z,
             const int ldx, const int ldy) {
  const __m256d alpha_zzz = _mm256_set1_pd((double)ALPHA_ZZZ);
  const __m256d alpha_nzz = _mm256_set1_pd((double)ALPHA_NZZ);
  const __m256d alpha_pzz = _mm256_set1_pd((double)ALPHA_PZZ);
  const __m256d alpha_znz = _mm256_set1_pd((double)ALPHA_ZNZ);
  const __m256d alpha_zpz = _mm256_set1_pd((double)ALPHA_ZPZ);
  const __m256d alpha_zzn = _mm256_set1_pd((double)ALPHA_ZZN);
  const __m256d alpha_zzp = _mm256_set1_pd((double)ALPHA_ZZP);
  __m256d zzz = _mm256_loadu_pd(a0 + INDEX(x, y, z, ldx, ldy));
  __m256d nzz = _mm256_loadu_pd(a0 + INDEX(x - 1, y, z, ldx, ldy));
  __m256d pzz = _mm256_loadu_pd(a0 + INDEX(x + 1, y, z, ldx, ldy));
  __m256d znz = _mm256_loadu_pd(a0 + INDEX(x, y - 1, z, ldx, ldy));
  __m256d zpz = _mm256_loadu_pd(a0 + INDEX(x, y + 1, z, ldx, ldy));
  __m256d zzn = _mm256_loadu_pd(a0 + INDEX(x, y, z - 1, ldx, ldy));
  __m256d zzp = _mm256_loadu_pd(a0 + INDEX(x, y, z + 1, ldx, ldy));
  __m256d res = _mm256_mul_pd(alpha_zzz, zzz);
  res = _mm256_fmadd_pd(alpha_nzz, nzz, res);
  res = _mm256_fmadd_pd(alpha_pzz, pzz, res);
  res = _mm256_fmadd_pd(alpha_znz, znz, res);
  res = _mm256_fmadd_pd(alpha_zpz, zpz, res);
  res = _mm256_fmadd_pd(alpha_zzn, zzn, res);
  res = _mm256_fmadd_pd(alpha_zzp, zzp, res);
  _mm256_storeu_pd(a1 + INDEX(x, y, z, ldx, ldy), res);
}
#endif

ptr_t stencil_7(ptr_t grid, ptr_t aux, const dist_grid_info_t *grid_info,
                int nt) {
  omp_set_num_threads(28); /* all 28 cores of the test node */
  ptr_t buffer[2] = {grid, aux};
  int x_start = grid_info->halo_size_x,
      x_end = grid_info->local_size_x + grid_info->halo_size_x;
  int y_start = grid_info->halo_size_y,
      y_end = grid_info->local_size_y + grid_info->halo_size_y;
  int z_start = grid_info->halo_size_z,
      z_end = grid_info->local_size_z + grid_info->halo_size_z;
  int ldx = grid_info->local_size_x + 2 * grid_info->halo_size_x;
  int ldy = grid_info->local_size_y + 2 * grid_info->halo_size_y;
  int ldz = grid_info->local_size_z + 2 * grid_info->halo_size_z;
  /* Tile sizes; x is left whole so that rows stay contiguous. */
  const int tx = x_end - x_start;
  const int ty = 16;
  const int tz = 112;
  for (int zz = z_start; zz < z_end; zz += tz) {
    /* A tile face on the domain boundary keeps slope 0; interior faces are
       skewed across time steps. */
    const int neg_z_slope = (zz == z_start) ? 0 : 1;
    const int pos_z_slope = (zz + tz < z_end) ? -1 : 0;
    for (int yy = y_start; yy < y_end; yy += ty) {
      const int neg_y_slope = (yy == y_start) ? 0 : 1;
      const int pos_y_slope = (yy + ty < y_end) ? -1 : 0;
      for (int xx = x_start; xx < x_end; xx += tx) {
        const int neg_x_slope = (xx == x_start) ? 0 : 1;
        const int pos_x_slope = (xx + tx < x_end) ? -1 : 0;
        for (int t = 0; t < nt; ++t) {
          /* Shift each skewed face by t, clamped to the domain. */
          const int block_min_x = Max(x_start, xx - t * neg_x_slope);
          const int block_min_y = Max(y_start, yy - t * neg_y_slope);
          const int block_min_z = Max(z_start, zz - t * neg_z_slope);
          const int block_max_x =
              Min(x_end, Max(x_start, xx + tx + t * pos_x_slope));
          const int block_max_y =
              Min(y_end, Max(y_start, yy + ty + t * pos_y_slope));
          const int block_max_z =
              Min(z_end, Max(z_start, zz + tz + t * pos_z_slope));
          cptr_t a0 = buffer[t % 2];      /* read buffer */
          ptr_t a1 = buffer[(t + 1) % 2]; /* write buffer */
          /* Parallelism lives inside each tile, not across tiles. */
#pragma omp parallel for
          for (int z = block_min_z; z < block_max_z; z++) {
            for (int y = block_min_y; y < block_max_y; y++) {
#ifdef INTRINSIC
              /* Vectorized body, then a scalar loop for the tail. */
              for (int x = block_min_x; x < block_max_x / 4 * 4; x += 4)
                Kernel7(a0, a1, x, y, z, ldx, ldy);
              for (int x = block_max_x / 4 * 4; x < block_max_x; ++x) {
                a1[INDEX(x, y, z, ldx, ldy)] =
                    ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
                    ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
                    ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
                    ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
                    ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
                    ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
                    ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
              }
#else
#pragma omp simd
              for (int x = block_min_x; x < block_max_x; ++x) {
                a1[INDEX(x, y, z, ldx, ldy)] =
                    ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
                    ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
                    ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
                    ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
                    ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
                    ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
                    ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
              }
#endif
            }
          }
        }
      }
    }
  }
  return buffer[nt % 2];
}
```
Threads | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Thread | Speedup / Naive Single Thread |
---|---|---|---|---|
1 | 23.760512 | 7.3434040 | 1. | 11.24030552 |
2 | 12.095517 | 14.425432 | 1.9644067 | 22.08053145 |
4 | 6.763701 | 25.796978 | 3.5129455 | 39.48658064 |
8 | 3.647941 | 47.830557 | 6.513404 | 73.21265096 |
16 | 2.625376 | 66.460208 | 9.05032707 | 101.72844132 |
28 | 2.359089 | 73.962038 | 10.07190099 | 113.21124428 |
We partition the data across processes and try to minimize communication. Each block must exchange its boundary layer with the adjacent blocks, so the communication volume is proportional to the internal surface area created by the partitioning. Cutting along a single axis clearly maximizes that added area, so we block along all of the x, y, and z axes (3D blocking), with hand-picked decompositions for 2, 4, 8, 16, and 28 processes. For example, on a 512 x 512 x 512 grid with 8 processes, slabs along one axis create 7 internal interfaces of 512 x 512 cells each, while a 2 x 2 x 2 decomposition creates only 3.
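
To see how the decompositions below were chosen, a hypothetical helper (not part of the submitted code) can enumerate the factorizations of the process count and pick the one with the least internal surface area:

```c
/* Hypothetical helper: choose px * py * pz == p minimizing the internal
 * surface area (and hence halo traffic) for an n x n x n grid. */
#include <limits.h>
#include <stdio.h>

int main(void) {
  const int p = 28, n = 512; /* assumed process count and grid size */
  long best = LONG_MAX;
  int bx = 1, by = 1, bz = p;
  for (int px = 1; px <= p; ++px) {
    if (p % px != 0) continue;
    for (int py = 1; px * py <= p; ++py) {
      if ((p / px) % py != 0) continue;
      const int pz = p / px / py;
      /* Cutting an axis into k pieces adds (k - 1) planes of n * n cells. */
      const long area = (long)n * n * ((px - 1) + (py - 1) + (pz - 1));
      if (area < best) { best = area; bx = px; by = py; bz = pz; }
    }
  }
  printf("best: %d x %d x %d, internal area %ld cells\n", bx, by, bz, best);
  return 0;
}
```

For p = 28 this reports 2 x 2 x 7, matching the decomposition hard-coded in create_dist_grid below.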
In testing, we found that non-blocking communication gave no significant improvement while being more complex to program and prone to bugs such as deadlock, so we settled on MPI_Sendrecv() for the halo exchange.
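
For reference, a minimal sketch of the non-blocking variant we measured (only the x-direction exchange is shown; it reuses the identifiers from the stencil_7 listing below):

```c
/* Sketch of the rejected non-blocking x-direction halo exchange.
 * a0, block_size, pid, YZ_PLANE, x_start, x_end, ldx, ldy are the same
 * variables as in stencil_7 below. */
MPI_Request reqs[4];
int nreq = 0;
if (block_size[kXPred]) { /* neighbor at lower x */
  MPI_Irecv(&a0[INDEX(x_start - 1, 0, 0, ldx, ldy)], 1, YZ_PLANE,
            pid[kXPred], kXSucc, MPI_COMM_WORLD, &reqs[nreq++]);
  MPI_Isend(&a0[INDEX(x_start, 0, 0, ldx, ldy)], 1, YZ_PLANE,
            pid[kXPred], kXPred, MPI_COMM_WORLD, &reqs[nreq++]);
}
if (block_size[kXSucc]) { /* neighbor at higher x */
  MPI_Irecv(&a0[INDEX(x_end, 0, 0, ldx, ldy)], 1, YZ_PLANE,
            pid[kXSucc], kXPred, MPI_COMM_WORLD, &reqs[nreq++]);
  MPI_Isend(&a0[INDEX(x_end - 1, 0, 0, ldx, ldy)], 1, YZ_PLANE,
            pid[kXSucc], kXSucc, MPI_COMM_WORLD, &reqs[nreq++]);
}
MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
```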
Manual vectorization with Intel intrinsics is used here as well.
```c
#define INTRINSIC

#include <immintrin.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "common.h"

const char *version_name = "MPI";

typedef struct {
  int num_block_x, num_block_y, num_block_z;
  int id_x, id_y, id_z;
} GridId;

enum Direction { kXPred, kXSucc, kYPred, kYSucc, kZPred, kZSucc };

int Min(const int a, const int b) { return (b < a) ? b : a; }
int Ceiling(const int a, const int b) { return (a + (b - 1)) / b; }

/* Split global_size into num_block nearly equal pieces; return the local
   size and offset of piece `id`. Pieces past the end get size 0. */
void Blocking(const int id, const int global_size, const int num_block,
              int *local_size, int *offset) {
  int block_size = Ceiling(global_size, num_block);
  *offset = block_size * id;
  if ((*offset) < global_size) {
    *local_size = Min(block_size, global_size - (*offset));
  } else {
    *local_size = 0;
  }
}

void create_dist_grid(dist_grid_info_t *grid_info) {
  GridId *grid_id = malloc(sizeof(GridId));
  grid_info->additional_info = grid_id;
  grid_id->num_block_x = 1;
  grid_id->num_block_y = 1;
  grid_id->num_block_z = 1;
  /* Hand-picked 3D decompositions for the tested process counts. */
  switch (grid_info->p_num) {
    case 4: {
      grid_id->num_block_x = 1;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 2;
      break;
    }
    case 8: {
      grid_id->num_block_x = 2;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 2;
      break;
    }
    case 16: {
      grid_id->num_block_x = 2;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 4;
      break;
    }
    case 28: {
      grid_id->num_block_x = 2;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 7;
      break;
    }
    default: {
      grid_id->num_block_x = 1;
      grid_id->num_block_y = 1;
      grid_id->num_block_z = grid_info->p_num;
      break;
    }
  }
  grid_id->id_x = grid_info->p_id % grid_id->num_block_x;
  grid_id->id_y =
      (grid_info->p_id / grid_id->num_block_x) % grid_id->num_block_y;
  grid_id->id_z =
      grid_info->p_id / (grid_id->num_block_x * grid_id->num_block_y);
  Blocking(grid_id->id_x, grid_info->global_size_x, grid_id->num_block_x,
           &(grid_info->local_size_x), &(grid_info->offset_x));
  Blocking(grid_id->id_y, grid_info->global_size_y, grid_id->num_block_y,
           &(grid_info->local_size_y), &(grid_info->offset_y));
  Blocking(grid_id->id_z, grid_info->global_size_z, grid_id->num_block_z,
           &(grid_info->local_size_z), &(grid_info->offset_z));
  grid_info->halo_size_x = 1;
  grid_info->halo_size_y = 1;
  grid_info->halo_size_z = 1;
}

void destroy_dist_grid(dist_grid_info_t *grid_info) {
  free(grid_info->additional_info);
}

#ifdef INTRINSIC
/* Compute the 7-point stencil for 4 consecutive x positions with AVX FMA. */
void Kernel7(cptr_t a0, ptr_t a1, const int x, const int y, const int z,
             const int ldx, const int ldy) {
  const __m256d alpha_zzz = _mm256_set1_pd((double)ALPHA_ZZZ);
  const __m256d alpha_nzz = _mm256_set1_pd((double)ALPHA_NZZ);
  const __m256d alpha_pzz = _mm256_set1_pd((double)ALPHA_PZZ);
  const __m256d alpha_znz = _mm256_set1_pd((double)ALPHA_ZNZ);
  const __m256d alpha_zpz = _mm256_set1_pd((double)ALPHA_ZPZ);
  const __m256d alpha_zzn = _mm256_set1_pd((double)ALPHA_ZZN);
  const __m256d alpha_zzp = _mm256_set1_pd((double)ALPHA_ZZP);
  __m256d zzz = _mm256_loadu_pd(a0 + INDEX(x, y, z, ldx, ldy));
  __m256d nzz = _mm256_loadu_pd(a0 + INDEX(x - 1, y, z, ldx, ldy));
  __m256d pzz = _mm256_loadu_pd(a0 + INDEX(x + 1, y, z, ldx, ldy));
  __m256d znz = _mm256_loadu_pd(a0 + INDEX(x, y - 1, z, ldx, ldy));
  __m256d zpz = _mm256_loadu_pd(a0 + INDEX(x, y + 1, z, ldx, ldy));
  __m256d zzn = _mm256_loadu_pd(a0 + INDEX(x, y, z - 1, ldx, ldy));
  __m256d zzp = _mm256_loadu_pd(a0 + INDEX(x, y, z + 1, ldx, ldy));
  __m256d res = _mm256_mul_pd(alpha_zzz, zzz);
  res = _mm256_fmadd_pd(alpha_nzz, nzz, res);
  res = _mm256_fmadd_pd(alpha_pzz, pzz, res);
  res = _mm256_fmadd_pd(alpha_znz, znz, res);
  res = _mm256_fmadd_pd(alpha_zpz, zpz, res);
  res = _mm256_fmadd_pd(alpha_zzn, zzn, res);
  res = _mm256_fmadd_pd(alpha_zzp, zzp, res);
  _mm256_storeu_pd(a1 + INDEX(x, y, z, ldx, ldy), res);
}
#endif

ptr_t stencil_7(ptr_t grid, ptr_t aux, const dist_grid_info_t *grid_info,
                int nt) {
  ptr_t buffer[2] = {grid, aux};
  int x_start = grid_info->halo_size_x,
      x_end = grid_info->local_size_x + grid_info->halo_size_x;
  int y_start = grid_info->halo_size_y,
      y_end = grid_info->local_size_y + grid_info->halo_size_y;
  int z_start = grid_info->halo_size_z,
      z_end = grid_info->local_size_z + grid_info->halo_size_z;
  int ldx = grid_info->local_size_x + 2 * grid_info->halo_size_x;
  int ldy = grid_info->local_size_y + 2 * grid_info->halo_size_y;
  int ldz = grid_info->local_size_z + 2 * grid_info->halo_size_z;
  /* Derived datatypes describing one halo plane of each orientation. */
  MPI_Datatype XY_PLANE, XZ_PLANE, YZ_PLANE;
  MPI_Type_vector(1, ldx * ldy, 0, MPI_DOUBLE, &XY_PLANE);     /* contiguous plane */
  MPI_Type_vector(ldz, ldx, ldx * ldy, MPI_DOUBLE, &XZ_PLANE); /* one x-row per z */
  MPI_Type_vector(ldy * ldz, 1, ldx, MPI_DOUBLE, &YZ_PLANE);   /* fully strided */
  MPI_Type_commit(&XY_PLANE);
  MPI_Type_commit(&XZ_PLANE);
  MPI_Type_commit(&YZ_PLANE);
  GridId *grid_id = grid_info->additional_info;
  /* Neighbor ranks; block_size[dir] != 0 means the neighbor exists. */
  int pid[6];
  int block_size[6] = {0};
  int offset; /* dummy output for Blocking() */
  if (grid_id->id_x > 0) {
    pid[kXPred] = INDEX(grid_id->id_x - 1, grid_id->id_y, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_x - 1, grid_info->global_size_x, grid_id->num_block_x,
             &block_size[kXPred], &offset);
  }
  if (grid_id->id_x + 1 < grid_id->num_block_x) {
    pid[kXSucc] = INDEX(grid_id->id_x + 1, grid_id->id_y, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_x + 1, grid_info->global_size_x, grid_id->num_block_x,
             &block_size[kXSucc], &offset);
  }
  if (grid_id->id_y > 0) {
    pid[kYPred] = INDEX(grid_id->id_x, grid_id->id_y - 1, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_y - 1, grid_info->global_size_y, grid_id->num_block_y,
             &block_size[kYPred], &offset);
  }
  if (grid_id->id_y + 1 < grid_id->num_block_y) {
    pid[kYSucc] = INDEX(grid_id->id_x, grid_id->id_y + 1, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_y + 1, grid_info->global_size_y, grid_id->num_block_y,
             &block_size[kYSucc], &offset);
  }
  if (grid_id->id_z > 0) {
    pid[kZPred] = INDEX(grid_id->id_x, grid_id->id_y, grid_id->id_z - 1,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_z - 1, grid_info->global_size_z, grid_id->num_block_z,
             &block_size[kZPred], &offset);
  }
  if (grid_id->id_z + 1 < grid_id->num_block_z) {
    pid[kZSucc] = INDEX(grid_id->id_x, grid_id->id_y, grid_id->id_z + 1,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_z + 1, grid_info->global_size_z, grid_id->num_block_z,
             &block_size[kZSucc], &offset);
  }
  for (int t = 0; t < nt; ++t) {
    ptr_t a0 = buffer[t % 2];       /* read buffer (halos filled by recvs) */
    ptr_t a1 = buffer[(t + 1) % 2]; /* write buffer */
    /* Exchange one halo plane with each existing neighbor. */
    if (block_size[kXPred]) {
      MPI_Sendrecv(&a0[INDEX(x_start, 0, 0, ldx, ldy)], 1, YZ_PLANE,
                   pid[kXPred], kXPred,
                   &a0[INDEX(x_start - 1, 0, 0, ldx, ldy)], 1, YZ_PLANE,
                   pid[kXPred], kXSucc, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    if (block_size[kXSucc]) {
      MPI_Sendrecv(&a0[INDEX(x_end - 1, 0, 0, ldx, ldy)], 1, YZ_PLANE,
                   pid[kXSucc], kXSucc,
                   &a0[INDEX(x_end, 0, 0, ldx, ldy)], 1, YZ_PLANE,
                   pid[kXSucc], kXPred, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    if (block_size[kYPred]) {
      MPI_Sendrecv(&a0[INDEX(0, y_start, 0, ldx, ldy)], 1, XZ_PLANE,
                   pid[kYPred], kYPred,
                   &a0[INDEX(0, y_start - 1, 0, ldx, ldy)], 1, XZ_PLANE,
                   pid[kYPred], kYSucc, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    if (block_size[kYSucc]) {
      MPI_Sendrecv(&a0[INDEX(0, y_end - 1, 0, ldx, ldy)], 1, XZ_PLANE,
                   pid[kYSucc], kYSucc,
                   &a0[INDEX(0, y_end, 0, ldx, ldy)], 1, XZ_PLANE,
                   pid[kYSucc], kYPred, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    if (block_size[kZPred]) {
      MPI_Sendrecv(&a0[INDEX(0, 0, z_start, ldx, ldy)], 1, XY_PLANE,
                   pid[kZPred], kZPred,
                   &a0[INDEX(0, 0, z_start - 1, ldx, ldy)], 1, XY_PLANE,
                   pid[kZPred], kZSucc, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    if (block_size[kZSucc]) {
      MPI_Sendrecv(&a0[INDEX(0, 0, z_end - 1, ldx, ldy)], 1, XY_PLANE,
                   pid[kZSucc], kZSucc,
                   &a0[INDEX(0, 0, z_end, ldx, ldy)], 1, XY_PLANE,
                   pid[kZSucc], kZPred, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    /* Advance one time step on the local block. */
    for (int z = z_start; z < z_end; ++z) {
      for (int y = y_start; y < y_end; ++y) {
#ifdef INTRINSIC
        /* Vectorized body, then a scalar loop for the tail. */
        for (int x = x_start; x < x_end / 4 * 4; x += 4)
          Kernel7(a0, a1, x, y, z, ldx, ldy);
        for (int x = x_end / 4 * 4; x < x_end; ++x) {
#else
        for (int x = x_start; x < x_end; ++x) {
#endif
          a1[INDEX(x, y, z, ldx, ldy)] =
              ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
              ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
              ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
              ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
              ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
              ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
              ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
        }
      }
    }
  }
  return buffer[nt % 2];
}
```
Processes | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Process | Speedup / Naive Single Thread |
---|---|---|---|---|
1 | 31.195547 | 5.593204 | 1. | 8.561332290 |
2 | 16.056561 | 10.866776 | 1.94285351 | 16.63341446 |
4 | 8.557309 | 20.389943 | 3.64548531 | 31.21021108 |
8 | 4.688768 | 37.212980 | 6.65324919 | 56.96067717 |
16 | 3.599826 | 48.469855 | 8.66584788 | 74.19120326 |
28 | 3.494326 | 49.933255 | 8.92748682 | 76.43118122 |
Size | Computation Time (s) | Performance (Gflop/s) |
---|---|---|
256 x 256 x 256 | 1.310379 | 16.644326 |
512 x 512 x 512 | 10.179150 | 17.141220 |
768 x 768 x 768 | 33.206675 | 17.733793 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive |
---|---|---|---|
256 x 256 x 256 | 1.042685 | 20.917516 | 1.25673554 |
512 x 512 x 512 | 6.357943 | 27.443318 | 1.60101311 |
768 x 768 x 768 | 21.682391 | 27.159379 | 1.53150423 |
```c
static const int tx = 256;
static const int ty = 16;
```
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.656014 | 33.246826 | 1.99748707 | 1.58942515 |
512 x 512 x 512 | 4.403168 | 39.626707 | 2.31177868 | 1.44394738 |
768 x 768 x 768 | 15.625801 | 37.686406 | 2.12511819 | 1.38760190 |
```c
static const int kCutoff = (1 << 20);
static const int ds = 1;
```
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.290660 | 75.037455 | 4.50829039 | 3.58730238 |
512 x 512 x 512 | 5.862571 | 29.762205 | 1.73629444 | 1.08449733 |
768 x 768 x 768 | 17.230060 | 34.177494 | 1.92725234 | 1.25840484 |
```c
const int tx = x_end - x_start;
const int ty = 32;
const int tz = 64;
```
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.244840 | 89.080177 | 5.35198463 | 4.25864032 |
512 x 512 x 512 | 3.916044 | 44.555941 | 2.59934480 | 1.62356246 |
768 x 768 x 768 | 11.524923 | 51.096243 | 2.88129240 | 1.88134799 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 1.461332 | 14.925003 | 0.89670216 | 0.71351699 |
512 x 512 x 512 | 12.507371 | 13.950418 | 0.81385211 | 0.50833569 |
768 x 768 x 768 | 40.458871 | 14.555035 | 0.82075138 | 0.53591192 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.728613 | 29.934117 | 1.79845774 | 1.43105505 |
512 x 512 x 512 | 5.759591 | 30.294345 | 1.76733891 | 1.10388784 |
768 x 768 x 768 | 16.067388 | 36.650655 | 2.06671269 | 1.34946587 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.715376 | 30.487998 | 1.83173521 | 1.45753435 |
512 x 512 x 512 | 5.494364 | 31.756734 | 1.85265308 | 1.15717546 |
768 x 768 x 768 | 16.741729 | 35.174399 | 1.98346733 | 1.29511058 |
```c
const int tx = x_end - x_start;
const int ty = 16;
const int tz = 112;
```
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.252214 | 86.475845 | 5.19551498 | 4.13413548 |
512 x 512 x 512 | 2.525018 | 69.101709 | 4.03131802 | 2.51797939 |
768 x 768 x 768 | 8.686356 | 67.793709 | 3.82285442 | 2.49614356 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.501527 | 43.487911 | 2.61277693 | 2.07901889 |
512 x 512 x 512 | 4.659057 | 37.450291 | 2.18480896 | 1.36464151 |
768 x 768 x 768 | 15.672776 | 37.573451 | 2.11874871 | 1.38344294 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.490962 | 44.423804 | 2.66900588 | 2.12376097 |
512 x 512 x 512 | 4.637739 | 37.622439 | 2.19485188 | 1.37091437 |
768 x 768 x 768 | 16.453953 | 35.789594 | 2.01815788 | 1.31776187 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.377116 | 57.834689 | 3.47473902 | 2.76489278 |
512 x 512 x 512 | 3.540040 | 49.288444 | 2.87543384 | 1.79600892 |
768 x 768 x 768 | 14.868124 | 39.606897 | 2.23341374 | 1.45831379 |
Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
---|---|---|---|---|
256 x 256 x 256 | 0.400361 | 54.476819 | 3.27299640 | 2.60436368 |
512 x 512 x 512 | 3.460323 | 50.423912 | 2.94167580 | 1.83738395 |
768 x 768 x 768 | 14.525021 | 40.542475 | 2.28617053 | 1.49276149 |