
PA2: Stencil Computation


Size: 512 x 512 x 512

Naive

No OPT Performance

| Threads | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Thread |
| --- | --- | --- | --- |
| 1 | 267.075484 | 0.653310 | 1 |
| 2 | 135.902281 | 1.283886 | 1.96520182 |
| 4 | 70.080111 | 2.489766 | 3.81100243 |
| 8 | 36.813651 | 4.739629 | 7.25479328 |
| 16 | 25.793147 | 6.764706 | 10.35451164 |
| 28 | 17.400062 | 10.027726 | 15.34910839 |

OPT Performance

| Threads | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Thread | Speedup / Naive Single Thread |
| --- | --- | --- | --- | --- |
| 1 | 33.793919 | 5.163149 | 1 | 7.90306133 |
| 2 | 20.366889 | 8.566996 | 1.65925795 | 13.11321731 |
| 4 | 11.615848 | 15.021120 | 2.90929431 | 22.99233136 |
| 8 | 10.494952 | 16.625426 | 3.22001670 | 25.44798947 |
| 16 | 8.618103 | 20.246109 | 3.92127150 | 30.99004913 |
| 28 | 7.622385 | 22.890871 | 4.43350967 | 35.03829882 |

OMP

We optimize with Time Skewing plus manual vectorization via Intel intrinsics. Time Skewing tiles the spatial dimensions and skews the tiles along the time dimension, so that the computation at step t + 1 can reuse step-t results still resident in cache, which improves performance. Note that blocks touching the domain boundary change size as t increases, so block sizes are not uniform. The good news is that Time Skewing here does not parallelize across blocks; it parallelizes within each block, so this does not cause severe load imbalance.
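
Concretely, with tz = 112 (as in the code below) the z blocks start as [1, 113), [113, 225), and so on; the bounds computed in the code slide every interior block boundary back by one cell per time step, so interior blocks keep their size while only the blocks at the domain ends shrink or grow.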

Manual vectorization with Intel intrinsics turns out to be a marginal optimization here: it is barely an improvement over auto-vectorization, but better than nothing.

Also, because the floating-point operations are performed in a different order, the results may deviate slightly from those of the serial version; the deviation is within the acceptable range, and the algorithm itself is correct.
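
This is ordinary floating-point non-associativity: vectorization and FMA contraction regroup the additions, which changes the rounding. A minimal standalone illustration, unrelated to the assignment code:

#include <stdio.h>

int main(void) {
  // Floating-point addition is not associative, so regrouping a sum
  // (as vectorized or blocked traversals effectively do) shifts rounding.
  double a = 1e16, b = -1e16, c = 1.0;
  printf("%g\n", (a + b) + c);  // prints 1
  printf("%g\n", a + (b + c));  // prints 0
  return 0;
}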

The other algorithms, such as 2D Cache Blocking, Cache Oblivious, and Circular Queue, were also implemented and tested following StencilProbe; none of them is as efficient as Time Skewing. See the Performance section for the results.

#define INTRINSIC

#include <immintrin.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#include "common.h"

const char *version_name = "OMP";

void create_dist_grid(dist_grid_info_t *grid_info) {
  // Single-process version: process 0 holds the entire grid
  if (grid_info->p_id == 0) {
    grid_info->local_size_x = grid_info->global_size_x;
    grid_info->local_size_y = grid_info->global_size_y;
    grid_info->local_size_z = grid_info->global_size_z;
  } else {
    grid_info->local_size_x = 0;
    grid_info->local_size_y = 0;
    grid_info->local_size_z = 0;
  }
  grid_info->offset_x = 0;
  grid_info->offset_y = 0;
  grid_info->offset_z = 0;
  grid_info->halo_size_x = 1;
  grid_info->halo_size_y = 1;
  grid_info->halo_size_z = 1;
}

void destroy_dist_grid(dist_grid_info_t *grid_info) {}

int Min(const int a, const int b) { return (a < b) ? a : b; }

int Max(const int a, const int b) { return (a < b) ? b : a; }

#ifdef INTRINSIC
// Compute the 7-point update for 4 consecutive x elements as one AVX vector.
void Kernel7(cptr_t a0, ptr_t a1, const int x, const int y, const int z,
             const int ldx, const int ldy) {
  const __m256d alpha_zzz = _mm256_set1_pd((double)ALPHA_ZZZ);
  const __m256d alpha_nzz = _mm256_set1_pd((double)ALPHA_NZZ);
  const __m256d alpha_pzz = _mm256_set1_pd((double)ALPHA_PZZ);
  const __m256d alpha_znz = _mm256_set1_pd((double)ALPHA_ZNZ);
  const __m256d alpha_zpz = _mm256_set1_pd((double)ALPHA_ZPZ);
  const __m256d alpha_zzn = _mm256_set1_pd((double)ALPHA_ZZN);
  const __m256d alpha_zzp = _mm256_set1_pd((double)ALPHA_ZZP);
  __m256d zzz = _mm256_loadu_pd(a0 + INDEX(x, y, z, ldx, ldy));
  __m256d nzz = _mm256_loadu_pd(a0 + INDEX(x - 1, y, z, ldx, ldy));
  __m256d pzz = _mm256_loadu_pd(a0 + INDEX(x + 1, y, z, ldx, ldy));
  __m256d znz = _mm256_loadu_pd(a0 + INDEX(x, y - 1, z, ldx, ldy));
  __m256d zpz = _mm256_loadu_pd(a0 + INDEX(x, y + 1, z, ldx, ldy));
  __m256d zzn = _mm256_loadu_pd(a0 + INDEX(x, y, z - 1, ldx, ldy));
  __m256d zzp = _mm256_loadu_pd(a0 + INDEX(x, y, z + 1, ldx, ldy));
  __m256d res = _mm256_mul_pd(alpha_zzz, zzz);
  res = _mm256_fmadd_pd(alpha_nzz, nzz, res);
  res = _mm256_fmadd_pd(alpha_pzz, pzz, res);
  res = _mm256_fmadd_pd(alpha_znz, znz, res);
  res = _mm256_fmadd_pd(alpha_zpz, zpz, res);
  res = _mm256_fmadd_pd(alpha_zzn, zzn, res);
  res = _mm256_fmadd_pd(alpha_zzp, zzp, res);
  _mm256_storeu_pd(a1 + INDEX(x, y, z, ldx, ldy), res);
}
#endif

ptr_t stencil_7(ptr_t grid, ptr_t aux, const dist_grid_info_t *grid_info,
                int nt) {
  omp_set_num_threads(28);  // fixed at 28 threads, the largest configuration benchmarked

  ptr_t buffer[2] = {grid, aux};
  int x_start = grid_info->halo_size_x,
      x_end = grid_info->local_size_x + grid_info->halo_size_x;
  int y_start = grid_info->halo_size_y,
      y_end = grid_info->local_size_y + grid_info->halo_size_y;
  int z_start = grid_info->halo_size_z,
      z_end = grid_info->local_size_z + grid_info->halo_size_z;
  int ldx = grid_info->local_size_x + 2 * grid_info->halo_size_x;
  int ldy = grid_info->local_size_y + 2 * grid_info->halo_size_y;
  int ldz = grid_info->local_size_z + 2 * grid_info->halo_size_z;

  const int tx = x_end - x_start;  // block size along x-axis
  const int ty = 16;               // block size along y-axis
  const int tz = 112;              // block size along z-axis
  for (int zz = z_start; zz < z_end; zz += tz) {
    // Time-skewing slopes: block bounds slide back one cell per time step so
    // that step t+1 reuses step-t values still in cache; bounds pinned at the
    // domain boundary (slope 0) do not move.
    const int neg_z_slope = (zz == z_start) ? 0 : 1;
    const int pos_z_slope = (zz + tz < z_end) ? -1 : 0;
    for (int yy = y_start; yy < y_end; yy += ty) {
      const int neg_y_slope = (yy == y_start) ? 0 : 1;
      const int pos_y_slope = (yy + ty < y_end) ? -1 : 0;
      for (int xx = x_start; xx < x_end; xx += tx) {
        const int neg_x_slope = (xx == x_start) ? 0 : 1;
        const int pos_x_slope = (xx + tx < x_end) ? -1 : 0;
        for (int t = 0; t < nt; ++t) {
          const int block_min_x = Max(x_start, xx - t * neg_x_slope);
          const int block_min_y = Max(y_start, yy - t * neg_y_slope);
          const int block_min_z = Max(z_start, zz - t * neg_z_slope);
          const int block_max_x =
              Min(x_end, Max(x_start, xx + tx + t * pos_x_slope));
          const int block_max_y =
              Min(y_end, Max(y_start, yy + ty + t * pos_y_slope));
          const int block_max_z =
              Min(z_end, Max(z_start, zz + tz + t * pos_z_slope));
          cptr_t a0 = buffer[t % 2];
          ptr_t a1 = buffer[(t + 1) % 2];
#pragma omp parallel for
          for (int z = block_min_z; z < block_max_z; z++) {
            for (int y = block_min_y; y < block_max_y; y++) {
#ifdef INTRINSIC
              for (int x = block_min_x; x < block_max_x / 4 * 4; x += 4)
                Kernel7(a0, a1, x, y, z, ldx, ldy);
              for (int x = block_max_x / 4 * 4; x < block_max_x; ++x) {
                a1[INDEX(x, y, z, ldx, ldy)] =
                    ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
                    ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
                    ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
                    ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
                    ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
                    ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
                    ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
              }
#else
#pragma omp simd
              for (int x = block_min_x; x < block_max_x; ++x) {
                a1[INDEX(x, y, z, ldx, ldy)] =
                    ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
                    ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
                    ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
                    ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
                    ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
                    ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
                    ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
              }
#endif
            }
          }
        }
      }
    }
  }
  return buffer[nt % 2];
}

OMP Performance

| Threads | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Thread | Speedup / Naive Single Thread |
| --- | --- | --- | --- | --- |
| 1 | 23.760512 | 7.343404 | 1 | 11.24030552 |
| 2 | 12.095517 | 14.425432 | 1.96440672 | 22.08053145 |
| 4 | 6.763701 | 25.796978 | 3.51294553 | 39.48658064 |
| 8 | 3.647941 | 47.830557 | 6.513404 | 73.21265096 |
| 16 | 2.625376 | 66.460208 | 9.05032707 | 101.72844132 |
| 28 | 2.359089 | 73.962038 | 10.07190099 | 113.21124428 |

MPI

We partition the data among the processes while trying to minimize communication. Each block has to exchange its boundary data with the neighboring blocks, so the communication volume is proportional to the internal surface area introduced by the partitioning. Partitioning along a single axis clearly creates the largest surface area, so we instead apply 3D blocking along the x, y, and z axes, with hand-chosen block counts for 2, 4, 8, 16, and 28 processes, to reduce communication.
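
For example, on a 512 x 512 x 512 grid with 8 processes, slicing along z alone exposes 7 internal 512 x 512 planes (about 1.84M boundary points), and every interior slab exchanges two full 512 x 512 faces per step; a 2 x 2 x 2 decomposition exposes only 3 x 512 x 512 internal points (about 0.79M), and each 256 x 256 x 256 block exchanges just three 256 x 256 faces, roughly a third of the per-process halo traffic.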

In our tests, non-blocking communication brought no significant improvement, while being more complex to program and easier to get wrong (deadlocks and the like), so we settled on Sendrecv() for communication.
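
For reference, the non-blocking variant we tested was shaped roughly like the sketch below, shown for the x-predecessor direction only. It assumes the same pid[], block_size[], and plane-datatype setup as the final code in this section; the other five directions are analogous.

// Post receives and sends for every existing neighbor, then wait on all of
// them before computing.
MPI_Request reqs[12];
int nreq = 0;
if (block_size[kXPred]) {
  MPI_Irecv(&a0[INDEX(x_start - 1, 0, 0, ldx, ldy)], 1, YZ_PLANE,
            pid[kXPred], kXSucc, MPI_COMM_WORLD, &reqs[nreq++]);
  MPI_Isend(&a0[INDEX(x_start, 0, 0, ldx, ldy)], 1, YZ_PLANE,
            pid[kXPred], kXPred, MPI_COMM_WORLD, &reqs[nreq++]);
}
/* ...same pattern for kXSucc, kYPred, kYSucc, kZPred, kZSucc... */
MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);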

In addition, manual vectorization with Intel intrinsics is applied here as well.

#define INTRINSIC

#include <immintrin.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "common.h"

const char *version_name = "MPI";

typedef struct {
  int num_block_x, num_block_y, num_block_z;
  int id_x, id_y, id_z;
} GridId;

enum Direction { kXPred, kXSucc, kYPred, kYSucc, kZPred, kZSucc };

int Min(const int a, const int b) { return (b < a) ? b : a; }

int Ceiling(const int a, const int b) { return (a + (b - 1)) / b; }

// Split global_size cells into num_block contiguous blocks of at most
// Ceiling(global_size, num_block) cells; e.g., 512 cells over 7 blocks
// yields six blocks of 74 cells and a final block of 68.
void Blocking(const int id, const int global_size, const int num_block,
              int *local_size, int *offset) {
  int block_size = Ceiling(global_size, num_block);
  *offset = block_size * id;
  if ((*offset) < global_size) {
    *local_size = Min(block_size, global_size - (*offset));
  } else {
    *local_size = 0;
  }
}

void create_dist_grid(dist_grid_info_t *grid_info) {
  GridId *grid_id = malloc(sizeof(GridId));
  grid_info->additional_info = grid_id;
  grid_id->num_block_x = 1;
  grid_id->num_block_y = 1;
  grid_id->num_block_z = 1;
  switch (grid_info->p_num) {
    case 4: {
      grid_id->num_block_x = 1;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 2;
      break;
    }
    case 8: {
      grid_id->num_block_x = 2;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 2;
      break;
    }
    case 16: {
      grid_id->num_block_x = 2;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 4;
      break;
    }
    case 28: {
      grid_id->num_block_x = 2;
      grid_id->num_block_y = 2;
      grid_id->num_block_z = 7;
      break;
    }
    default: {
      grid_id->num_block_x = 1;
      grid_id->num_block_y = 1;
      grid_id->num_block_z = grid_info->p_num;
      break;
    }
  }
  grid_id->id_x = grid_info->p_id % grid_id->num_block_x;
  grid_id->id_y =
      (grid_info->p_id / grid_id->num_block_x) % grid_id->num_block_y;
  grid_id->id_z =
      grid_info->p_id / (grid_id->num_block_x * grid_id->num_block_y);
  Blocking(grid_id->id_x, grid_info->global_size_x, grid_id->num_block_x,
           &(grid_info->local_size_x), &(grid_info->offset_x));
  Blocking(grid_id->id_y, grid_info->global_size_y, grid_id->num_block_y,
           &(grid_info->local_size_y), &(grid_info->offset_y));
  Blocking(grid_id->id_z, grid_info->global_size_z, grid_id->num_block_z,
           &(grid_info->local_size_z), &(grid_info->offset_z));
  grid_info->halo_size_x = 1;
  grid_info->halo_size_y = 1;
  grid_info->halo_size_z = 1;
}

void destroy_dist_grid(dist_grid_info_t *grid_info) {
  free(grid_info->additional_info);
}

#ifdef INTRINSIC
// Compute the 7-point update for 4 consecutive x elements as one AVX vector.
void Kernel7(cptr_t a0, ptr_t a1, const int x, const int y, const int z,
             const int ldx, const int ldy) {
  const __m256d alpha_zzz = _mm256_set1_pd((double)ALPHA_ZZZ);
  const __m256d alpha_nzz = _mm256_set1_pd((double)ALPHA_NZZ);
  const __m256d alpha_pzz = _mm256_set1_pd((double)ALPHA_PZZ);
  const __m256d alpha_znz = _mm256_set1_pd((double)ALPHA_ZNZ);
  const __m256d alpha_zpz = _mm256_set1_pd((double)ALPHA_ZPZ);
  const __m256d alpha_zzn = _mm256_set1_pd((double)ALPHA_ZZN);
  const __m256d alpha_zzp = _mm256_set1_pd((double)ALPHA_ZZP);
  __m256d zzz = _mm256_loadu_pd(a0 + INDEX(x, y, z, ldx, ldy));
  __m256d nzz = _mm256_loadu_pd(a0 + INDEX(x - 1, y, z, ldx, ldy));
  __m256d pzz = _mm256_loadu_pd(a0 + INDEX(x + 1, y, z, ldx, ldy));
  __m256d znz = _mm256_loadu_pd(a0 + INDEX(x, y - 1, z, ldx, ldy));
  __m256d zpz = _mm256_loadu_pd(a0 + INDEX(x, y + 1, z, ldx, ldy));
  __m256d zzn = _mm256_loadu_pd(a0 + INDEX(x, y, z - 1, ldx, ldy));
  __m256d zzp = _mm256_loadu_pd(a0 + INDEX(x, y, z + 1, ldx, ldy));
  __m256d res = _mm256_mul_pd(alpha_zzz, zzz);
  res = _mm256_fmadd_pd(alpha_nzz, nzz, res);
  res = _mm256_fmadd_pd(alpha_pzz, pzz, res);
  res = _mm256_fmadd_pd(alpha_znz, znz, res);
  res = _mm256_fmadd_pd(alpha_zpz, zpz, res);
  res = _mm256_fmadd_pd(alpha_zzn, zzn, res);
  res = _mm256_fmadd_pd(alpha_zzp, zzp, res);
  _mm256_storeu_pd(a1 + INDEX(x, y, z, ldx, ldy), res);
}
#endif

ptr_t stencil_7(ptr_t grid, ptr_t aux, const dist_grid_info_t *grid_info,
                int nt) {
  ptr_t buffer[2] = {grid, aux};
  int x_start = grid_info->halo_size_x,
      x_end = grid_info->local_size_x + grid_info->halo_size_x;
  int y_start = grid_info->halo_size_y,
      y_end = grid_info->local_size_y + grid_info->halo_size_y;
  int z_start = grid_info->halo_size_z,
      z_end = grid_info->local_size_z + grid_info->halo_size_z;
  int ldx = grid_info->local_size_x + 2 * grid_info->halo_size_x;
  int ldy = grid_info->local_size_y + 2 * grid_info->halo_size_y;
  int ldz = grid_info->local_size_z + 2 * grid_info->halo_size_z;

  // Derived datatypes, one per halo plane: an XY plane is contiguous, an XZ
  // plane is ldz rows of ldx doubles, and a YZ plane is a strided column.
  MPI_Datatype XY_PLANE, XZ_PLANE, YZ_PLANE;
  MPI_Type_vector(/*count=*/1, /*blocklength=*/ldx * ldy, /*stride=*/0,
                  /*oldtype=*/MPI_DOUBLE, /*newtype=*/&XY_PLANE);
  MPI_Type_vector(/*count=*/ldz, /*blocklength=*/ldx, /*stride=*/ldx * ldy,
                  /*oldtype=*/MPI_DOUBLE, /*newtype=*/&XZ_PLANE);
  MPI_Type_vector(/*count=*/ldy * ldz, /*blocklength=*/1, /*stride=*/ldx,
                  /*oldtype=*/MPI_DOUBLE, /*newtype=*/&YZ_PLANE);
  MPI_Type_commit(&XY_PLANE);
  MPI_Type_commit(&XZ_PLANE);
  MPI_Type_commit(&YZ_PLANE);

  GridId *grid_id = grid_info->additional_info;
  int pid[6];
  int block_size[6] = {0};
  int offset;
  if (grid_id->id_x > 0) {
    pid[kXPred] = INDEX(grid_id->id_x - 1, grid_id->id_y, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_x - 1, grid_info->global_size_x, grid_id->num_block_x,
             &block_size[kXPred], &offset);
  }
  if (grid_id->id_x + 1 < grid_id->num_block_x) {
    pid[kXSucc] = INDEX(grid_id->id_x + 1, grid_id->id_y, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_x + 1, grid_info->global_size_x, grid_id->num_block_x,
             &block_size[kXSucc], &offset);
  }
  if (grid_id->id_y > 0) {
    pid[kYPred] = INDEX(grid_id->id_x, grid_id->id_y - 1, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_y - 1, grid_info->global_size_y, grid_id->num_block_y,
             &block_size[kYPred], &offset);
  }
  if (grid_id->id_y + 1 < grid_id->num_block_y) {
    pid[kYSucc] = INDEX(grid_id->id_x, grid_id->id_y + 1, grid_id->id_z,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_y + 1, grid_info->global_size_y, grid_id->num_block_y,
             &block_size[kYSucc], &offset);
  }
  if (grid_id->id_z > 0) {
    pid[kZPred] = INDEX(grid_id->id_x, grid_id->id_y, grid_id->id_z - 1,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_z - 1, grid_info->global_size_z, grid_id->num_block_z,
             &block_size[kZPred], &offset);
  }
  if (grid_id->id_z + 1 < grid_id->num_block_z) {
    pid[kZSucc] = INDEX(grid_id->id_x, grid_id->id_y, grid_id->id_z + 1,
                        grid_id->num_block_x, grid_id->num_block_y);
    Blocking(grid_id->id_z + 1, grid_info->global_size_z, grid_id->num_block_z,
             &block_size[kZSucc], &offset);
  }
  for (int t = 0; t < nt; ++t) {
    ptr_t a0 = buffer[t % 2];
    ptr_t a1 = buffer[(t + 1) % 2];
    if (block_size[kXPred]) {
      MPI_Sendrecv(/*sendbuf=*/&a0[INDEX(x_start, 0, 0, ldx, ldy)],
                   /*sendcount=*/1, /*sendtype=*/YZ_PLANE,
                   /*dest=*/pid[kXPred], /*sendtag=*/kXPred,
                   /*recvbuf=*/&a0[INDEX(x_start - 1, 0, 0, ldx, ldy)],
                   /*recvcount=*/1, /*recvtype=*/YZ_PLANE,
                   /*source=*/pid[kXPred], /*recvtag=*/kXSucc,
                   /*comm=*/MPI_COMM_WORLD, /*status=*/MPI_STATUS_IGNORE);
    }
    if (block_size[kXSucc]) {
      MPI_Sendrecv(/*sendbuf=*/&a0[INDEX(x_end - 1, 0, 0, ldx, ldy)],
                   /*sendcount=*/1, /*sendtype=*/YZ_PLANE,
                   /*dest=*/pid[kXSucc], /*sendtag=*/kXSucc,
                   /*recvbuf=*/&a0[INDEX(x_end, 0, 0, ldx, ldy)],
                   /*recvcount=*/1, /*recvtype=*/YZ_PLANE,
                   /*source=*/pid[kXSucc], /*recvtag=*/kXPred,
                   /*comm=*/MPI_COMM_WORLD, /*status=*/MPI_STATUS_IGNORE);
    }
    if (block_size[kYPred]) {
      MPI_Sendrecv(/*sendbuf=*/&a0[INDEX(0, y_start, 0, ldx, ldy)],
                   /*sendcount=*/1, /*sendtype=*/XZ_PLANE,
                   /*dest=*/pid[kYPred], /*sendtag=*/kYPred,
                   /*recvbuf=*/&a0[INDEX(0, y_start - 1, 0, ldx, ldy)],
                   /*recvcount=*/1, /*recvtype=*/XZ_PLANE,
                   /*source=*/pid[kYPred], /*recvtag=*/kYSucc,
                   /*comm=*/MPI_COMM_WORLD, /*status=*/MPI_STATUS_IGNORE);
    }
    if (block_size[kYSucc]) {
      MPI_Sendrecv(/*sendbuf=*/&a0[INDEX(0, y_end - 1, 0, ldx, ldy)],
                   /*sendcount=*/1, /*sendtype=*/XZ_PLANE,
                   /*dest=*/pid[kYSucc], /*sendtag=*/kYSucc,
                   /*recvbuf=*/&a0[INDEX(0, y_end, 0, ldx, ldy)],
                   /*recvcount=*/1, /*recvtype=*/XZ_PLANE,
                   /*source=*/pid[kYSucc], /*recvtag=*/kYPred,
                   /*comm=*/MPI_COMM_WORLD, /*status=*/MPI_STATUS_IGNORE);
    }
    if (block_size[kZPred]) {
      MPI_Sendrecv(/*sendbuf=*/&a0[INDEX(0, 0, z_start, ldx, ldy)],
                   /*sendcount=*/1, /*sendtype=*/XY_PLANE,
                   /*dest=*/pid[kZPred], /*sendtag=*/kZPred,
                   /*recvbuf=*/&a0[INDEX(0, 0, z_start - 1, ldx, ldy)],
                   /*recvcount=*/1, /*recvtype=*/XY_PLANE,
                   /*source=*/pid[kZPred], /*recvtag=*/kZSucc,
                   /*comm=*/MPI_COMM_WORLD, /*status=*/MPI_STATUS_IGNORE);
    }
    if (block_size[kZSucc]) {
      MPI_Sendrecv(/*sendbuf=*/&a0[INDEX(0, 0, z_end - 1, ldx, ldy)],
                   /*sendcount=*/1, /*sendtype=*/XY_PLANE,
                   /*dest=*/pid[kZSucc], /*sendtag=*/kZSucc,
                   /*recvbuf=*/&a0[INDEX(0, 0, z_end, ldx, ldy)],
                   /*recvcount=*/1, /*recvtype=*/XY_PLANE,
                   /*source=*/pid[kZSucc], /*recvtag=*/kZPred,
                   /*comm=*/MPI_COMM_WORLD, /*status=*/MPI_STATUS_IGNORE);
    }
    for (int z = z_start; z < z_end; ++z) {
      for (int y = y_start; y < y_end; ++y) {
#ifdef INTRINSIC
        for (int x = x_start; x < x_end / 4 * 4; x += 4)
          Kernel7(a0, a1, x, y, z, ldx, ldy);
        for (int x = x_end / 4 * 4; x < x_end; ++x) {
#else
        for (int x = x_start; x < x_end; ++x) {
#endif
          a1[INDEX(x, y, z, ldx, ldy)] =
              ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
              ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
              ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
              ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
              ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
              ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
              ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
        }
      }
    }
  }
  MPI_Type_free(&XY_PLANE);
  MPI_Type_free(&XZ_PLANE);
  MPI_Type_free(&YZ_PLANE);
  return buffer[nt % 2];
}

MPI Performance

| Processes | Computation Time (s) | Performance (Gflop/s) | Speedup / Single Process | Speedup / Naive Single Thread |
| --- | --- | --- | --- | --- |
| 1 | 31.195547 | 5.593204 | 1 | 8.56133229 |
| 2 | 16.056561 | 10.866776 | 1.94285351 | 16.63341446 |
| 4 | 8.557309 | 20.389943 | 3.64548531 | 31.21021108 |
| 8 | 4.688768 | 37.212980 | 6.65324919 | 56.96067717 |
| 16 | 3.599826 | 48.469855 | 8.66584788 | 74.19120326 |
| 28 | 3.494326 | 49.933255 | 8.92748682 | 76.43118122 |

Performance

Naive

No OPT

| Size | Computation Time (s) | Performance (Gflop/s) |
| --- | --- | --- |
| 256 x 256 x 256 | 1.310379 | 16.644326 |
| 512 x 512 x 512 | 10.179150 | 17.141220 |
| 768 x 768 x 768 | 33.206675 | 17.733793 |

OPT

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive |
| --- | --- | --- | --- |
| 256 x 256 x 256 | 1.042685 | 20.917516 | 1.25673554 |
| 512 x 512 x 512 | 6.357943 | 27.443318 | 1.60101311 |
| 768 x 768 x 768 | 21.682391 | 27.159379 | 1.53150423 |

OMP

2D Cache Blocking

static const int tx = 256;
static const int ty = 16;
| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.656014 | 33.246826 | 1.99748707 | 1.58942515 |
| 512 x 512 x 512 | 4.403168 | 39.626707 | 2.31177868 | 1.44394738 |
| 768 x 768 x 768 | 15.625801 | 37.686406 | 2.12511819 | 1.38760190 |
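
The loop structure of this variant is roughly the sketch below, reusing the grid conventions (buffer, INDEX, ALPHA_*, Min) from the OMP code above; it is a minimal reconstruction, not the exact submitted kernel.

// 2D cache blocking: tile the x/y plane with tx-by-ty blocks and stream
// along z, so each block's working set stays cache-resident.
for (int t = 0; t < nt; ++t) {
  cptr_t a0 = buffer[t % 2];
  ptr_t a1 = buffer[(t + 1) % 2];
#pragma omp parallel for collapse(2)
  for (int yy = y_start; yy < y_end; yy += ty)
    for (int xx = x_start; xx < x_end; xx += tx)
      for (int z = z_start; z < z_end; ++z)
        for (int y = yy; y < Min(yy + ty, y_end); ++y)
          for (int x = xx; x < Min(xx + tx, x_end); ++x)
            a1[INDEX(x, y, z, ldx, ldy)] =
                ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
                ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
                ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
                ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
                ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
                ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
                ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
}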

Cache Oblivious

static const int kCutoff = (1 << 20);
static const int ds = 1;
| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.290660 | 75.037455 | 4.50829039 | 3.58730238 |
| 512 x 512 x 512 | 5.862571 | 29.762205 | 1.73629444 | 1.08449733 |
| 768 x 768 x 768 | 17.230060 | 34.177494 | 1.92725234 | 1.25840484 |
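
This variant follows StencilProbe's cache-oblivious traversal. The sketch below shows the space-cutting recursion only (the time cuts, where the slope ds enters, are omitted) and reuses the grid conventions and the kCutoff constant listed above; it is a simplified reconstruction under those assumptions, not the submitted version.

// Recursively bisect the longer of the y/z extents until the sub-volume
// drops below kCutoff, then sweep it directly with the 7-point update.
// Per time step, call as:
//   Oblivious(a0, a1, y_start, y_end, z_start, z_end, x_start, x_end, ldx, ldy);
static void Oblivious(cptr_t a0, ptr_t a1, int y0, int y1, int z0, int z1,
                      int x_start, int x_end, int ldx, int ldy) {
  if ((long)(x_end - x_start) * (y1 - y0) * (z1 - z0) <= kCutoff) {
    for (int z = z0; z < z1; ++z)
      for (int y = y0; y < y1; ++y)
        for (int x = x_start; x < x_end; ++x)
          a1[INDEX(x, y, z, ldx, ldy)] =
              ALPHA_ZZZ * a0[INDEX(x, y, z, ldx, ldy)] +
              ALPHA_NZZ * a0[INDEX(x - 1, y, z, ldx, ldy)] +
              ALPHA_PZZ * a0[INDEX(x + 1, y, z, ldx, ldy)] +
              ALPHA_ZNZ * a0[INDEX(x, y - 1, z, ldx, ldy)] +
              ALPHA_ZPZ * a0[INDEX(x, y + 1, z, ldx, ldy)] +
              ALPHA_ZZN * a0[INDEX(x, y, z - 1, ldx, ldy)] +
              ALPHA_ZZP * a0[INDEX(x, y, z + 1, ldx, ldy)];
  } else if (z1 - z0 >= y1 - y0) {  // always cut the longer axis
    int zm = z0 + (z1 - z0) / 2;
    Oblivious(a0, a1, y0, y1, z0, zm, x_start, x_end, ldx, ldy);
    Oblivious(a0, a1, y0, y1, zm, z1, x_start, x_end, ldx, ldy);
  } else {
    int ym = y0 + (y1 - y0) / 2;
    Oblivious(a0, a1, y0, ym, z0, z1, x_start, x_end, ldx, ldy);
    Oblivious(a0, a1, ym, y1, z0, z1, x_start, x_end, ldx, ldy);
  }
}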

Time Skewing

const int tx = x_end - x_start;
const int ty = 32;
const int tz = 64;
| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.244840 | 89.080177 | 5.35198463 | 4.25864032 |
| 512 x 512 x 512 | 3.916044 | 44.555941 | 2.59934480 | 1.62356246 |
| 768 x 768 x 768 | 11.524923 | 51.096243 | 2.88129240 | 1.88134799 |

Circular Queue

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 1.461332 | 14.925003 | 0.89670216 | 0.71351699 |
| 512 x 512 x 512 | 12.507371 | 13.950418 | 0.81385211 | 0.50833569 |
| 768 x 768 x 768 | 40.458871 | 14.555035 | 0.82075138 | 0.53591192 |

Auto SIMD

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.728613 | 29.934117 | 1.79845774 | 1.43105505 |
| 512 x 512 x 512 | 5.759591 | 30.294345 | 1.76733891 | 1.10388784 |
| 768 x 768 x 768 | 16.067388 | 36.650655 | 2.06671269 | 1.34946587 |

Intrinsic

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.715376 | 30.487998 | 1.83173521 | 1.45753435 |
| 512 x 512 x 512 | 5.494364 | 31.756734 | 1.85265308 | 1.15717546 |
| 768 x 768 x 768 | 16.741729 | 35.174399 | 1.98346733 | 1.29511058 |

Time Skewing + Intrinsic

const int tx = x_end - x_start;
const int ty = 16;
const int tz = 112;
| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.252214 | 86.475845 | 5.19551498 | 4.13413548 |
| 512 x 512 x 512 | 2.525018 | 69.101709 | 4.03131802 | 2.51797939 |
| 768 x 768 x 768 | 8.686356 | 67.793709 | 3.82285442 | 2.49614356 |

MPI

Blocking Communication

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.501527 | 43.487911 | 2.61277693 | 2.07901889 |
| 512 x 512 x 512 | 4.659057 | 37.450291 | 2.18480896 | 1.36464151 |
| 768 x 768 x 768 | 15.672776 | 37.573451 | 2.11874871 | 1.38344294 |

Non-Blocking Communication

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.490962 | 44.423804 | 2.66900588 | 2.12376097 |
| 512 x 512 x 512 | 4.637739 | 37.622439 | 2.19485188 | 1.37091437 |
| 768 x 768 x 768 | 16.453953 | 35.789594 | 2.01815788 | 1.31776187 |

3D Blocking

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.377116 | 57.834689 | 3.47473902 | 2.76489278 |
| 512 x 512 x 512 | 3.540040 | 49.288444 | 2.87543384 | 1.79600892 |
| 768 x 768 x 768 | 14.868124 | 39.606897 | 2.23341374 | 1.45831379 |

3D Blocking + Intrinsic

| Size | Computation Time (s) | Performance (Gflop/s) | Speedup / Naive | Speedup / Naive OPT |
| --- | --- | --- | --- | --- |
| 256 x 256 x 256 | 0.400361 | 54.476819 | 3.27299640 | 2.60436368 |
| 512 x 512 x 512 | 3.460323 | 50.423912 | 2.94167580 | 1.83738395 |
| 768 x 768 x 768 | 14.525021 | 40.542475 | 2.28617053 | 1.49276149 |