Index
HPC - Universidad de Sevilla

Example - Scale/add of a vector

Source code in hpc/src/scale

We will start with a situation we might find in an already existing program:

Data organized in structures (pad1..3 are "placeholders" for other real variables you will have in your program).

struct
{
   char pad1;
   REAL va[ARRAY_SIZE];
   char pad2;
   REAL vb[ARRAY_SIZE];
   char pad3;
} data;

Vector operations on data: scale + add (REAL = double or float)

/**
 * va = a*va + b
 */
void scale(int n, REAL a, REAL *va, REAL *vb) {
  int i;
  for(i=0; i < n; i++) {
    va[i] = a * va[i] + vb[i];
  }
}

The main program is organized in a way that these transformations are applied iteratively and vector size is small so both vectors fit in L1 cache, this eliminates limitations from memory bandwidth.

In order to check our programs, it is always interesting to have an estimation of the error. Applying some simple math,

Mathematica

we can find the effect of applying N iterations to be

va_N = (a^(N-1)-1)/(a-1) * vb +  a^N  * va_0   (a != 1)
va_N = N * vb +  a^N  * va_0                   (a == 1)

The Makefile in hpc/src/scale is a simplified version of the one you can find in hpc/src/vectorize. Make shure you understand what the Makefile does.

We will now try to apply a series of modifications to the original code and meassure the effect these changes have on performance. You can try to change the CFLAGS = .. setting in the Makefile and time a version without vectorization (CFLAGS= -O1 for gcc).

scale1 - original version
// define in extra .c file:
void scale(int n, REAL a, REAL *va, REAL *vb) {
  int i;
  for(i=0; i < n; i++) {
    va[i] = a * va[i] + vb[i];
  }
}
scale2 - inline function call to scale()
// define only in .h file:
inline void scale(int n, REAL a, REAL *va, REAL *vb) {
  int i;
  for(i=0; i < n; i++) {
    va[i] = a * va[i] + vb[i];
  }
}
scale3 - no dependecies from pointers
inline void scale(int n, REAL a,
                  REAL * __restrict__ va, REAL * __restrict__ vb) {
scale4 - align arrays
struct
{
   char pad1;
   REAL va[ARRAY_SIZE] __attribute__((aligned (32)));
   char pad2;
   REAL vb[ARRAY_SIZE] __attribute__((aligned (32)));
   char pad3;
} data
scale5 - intrinsics (example for float)

Before checking the scale5 example, please go through the simple example `hpc/src/scale/intrin.c'.

inline void scale(int n, REAL a,
                  REAL * __restrict__ va, REAL * __restrict__ vb) {
  int i;
  if (n % 8 != 0) {
    printf("Fatal error: loop count must be divisibe by 8!\n");
    exit(-1);
  }

  __m256 vrega = _mm256_set1_ps(a);

  for(i=0; i < n; i+=8) {
    __m256 vregva1 = _mm256_load_ps (&va[i]);
    __m256 vregvb1 = _mm256_load_ps (&vb[i]);
#ifdef __AVX2__
    vregva1 =  _mm256_fmadd_ps (vrega, vregva1, vregvb1);
#else
    vregva1 = _mm256_mul_ps(vrega,vregva1);
    vregva1 = _mm256_add_ps(vregva1,vregvb1);
#endif
    _mm256_store_ps (&va[i], vregva1);
  }
}

I have run these codes on my laptop (I7-4600U, Haswell) and on the CICA cluster (AMD Opteron 6344, Piledriver). The results can be found in this PDF.

Fortran version
[denk@scadm01 ~]$ gfortran --version | grep GCC
GNU Fortran (GCC) 4.4.7 20120313 (Red Hat 4.4.7-4)
[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize -march=native -Wall iterate1.f
[denk@scadm01 ~]$ ./a.out < data.in
Flops =   0.1681590E+10 va =   0.6331605E+10
[denk@scadm01 ~]$ scl enable devtoolset-3 bash
[denk@scadm01 ~]$ gfortran --version | grep GCC
GNU Fortran (GCC) 4.9.2 20150212 (Red Hat 4.9.2-6)

[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize -march=native -Wall iterate1.f
[denk@scadm01 ~]$ ./a.out < data.in
Flops =   0.2863329E+10 va =   0.6331605E+10

Extract of machine code, gfortran 4.4.7

    call    _gfortran_cpu_time_4
    xorl    %r12d, %r12d
    leaq    32(%rsp), %rax
    ....    ...... (various instructions)
    call    scale_
    ....    ...... (various instructions)
    call    _gfortran_cpu_time_4

Extract of machine code, gfortran 4.9.2

    call    _gfortran_cpu_time_4
    vmovddup        40(%rsp), %xmm1
    movl    $100000000, %edx
    .p2align 4,,10
    .p2align 3
.L4:
    movq    %rbp, %rax
    .p2align 4,,10
    .p2align 3
.L3:
    vmovups (%rax), %xmm5
    addq    $16, %rax
    vfmaddpd        504(%rax), %xmm1, %xmm5, %xmm0
    vmovups %xmm0, -16(%rax)
    cmpq    %rbx, %rax
    jne     .L3
    decl    %edx
    jne     .L4
    xorl    %eax, %eax
    leaq    28(%rsp), %rdi
    call    _gfortran_cpu_time_4

Force no inline:

[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize -fno-inline -march=native -Wall iterate1.f
[denk@scadm01 ~]$ ./a.out < data.in
Flops =   0.2573773E+10 va =   0.6331605E+10

Data alignment Intel compiler

[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize  -Wall -march=native iterate1.f
[denk@scadm01 ~]$ ./a.out < data.in
        8                   16
Flops =   0.2423234E+10 va =   0.6331605E+10
[denk@scadm01 ~]$ vi iterate1.f
[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize  -Wall -march=native iterate1.f
[denk@scadm01 ~]$ ./a.out < data.in
        0                    0
Flops =   0.2775791E+10 va =   0.6331605E+10

Loop unrolling

[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize -Wall -march=native iterate1.f  [denk@scadm01 ~]$ ./a.out < data.in
&va % 32 =   0 &vb % 32 =   0
Flops =   0.2751332E+10 va =   0.6331605E+10
[denk@scadm01 ~]$ gfortran -O3 -ftree-vectorize -funroll-loops -Wall -march=native iterate1.f
[denk@scadm01 ~]$ ./a.out < data.in
&va % 32 =   0 &vb % 32 =   0
Flops =   0.5740783E+10 va =   0.6331605E+10
Some links