Index
HPC - Universidad de Sevilla

Programming for shared memory with OpenMP (Open Multi-Processing)

Our first OpenMP program

The example files for this section can be found in src/openmp/simple_examples

C: parfor.c

#include <stdio.h>

#define NMAX 5
double a[NMAX],b[NMAX];

int main() {
  int i,n=NMAX;

  printf("initialization\n");
  // serial loop
  for (i=0; i<n; i++) {
    a[i] = i;
    b[i] = 0.0;
    printf("%d %10.3e\n",i,a[i]);
  }

  printf("calculation\n");
  // parallel loop
#pragma omp parallel for
  for (i=1; i<n; i++) { 
    b[i] = 0.5*(a[i-1] + a[i]);
    printf("%d %10.3e\n",i,b[i]);
  }

  printf("results\n");
  for (i=1; i<n; i++) {
    printf("%d %10.3e\n",i,b[i]);
  }
  return 0;
}

Fortran: parfor_f.f

      PROGRAM PARFOR
      PARAMETER (NMAX=5)
      REAL*8 a(NMAX), b(NMAX)
C
      write(*,*) 'Initialization'
      Do i=1,NMAX
        a(I)=I
        b(I)=0
        write(*,*) i,a(i)
      ENDDO
C
      write(*,*) 'Calculation'
!$OMP PARALLEL DO
      DO i=2,NMAX
        b(i) = 0.5*(a(i-1) + a(i))
        write(*,*) i,b(i)
      ENDDO
C
      write(*,*) 'Results'
  Do i=2,NMAX
        write(*,*) i,b(i)
      ENDDO
      END PROGRAM PARFOR

Exercises:

-> Parallel means out of "order"

Race conditions

C: race.c

#include <stdio.h>

int main() {
  int i;
  double sum=0;
#pragma omp parallel for
  for (i=0; i<1000; i++) {
    sum = sum + 1;
  }
  printf("sum %10.3e\n",sum);
  return 0;
}

Exercises:

-> unsynchronized access and shared memory = possible mess

A team and a cake

Team

Cake

C: teamcake.c

#include <omp.h>
  ...
  code
  ...
#pragma omp parallel private(iam,nt,ipoints,istart)
  {
    iam = omp_get_thread_num();
    nt =  omp_get_num_threads();
    ipoints = NMAX / nt;      /* cake slice size */
    istart = iam * ipoints;   /* where it starts */
    if (iam == nt-1)          /* last gets leftovers */
      ipoints = NMAX - istart;
    calc(a,b, istart, ipoints);
  }

  for (i=1;i<NMAX;i++) {
    printf("%02d %10.3e\n",i,b[i]);
  }

Fortran: teamcake_f.f

!$    INCLUDE "omp_lib.h"
      ...
      code
      ... 
!$OMP PARALLEL PRIVATE(iam,nt,ipoints,istart)
      iam = omp_get_thread_num()
      nt =  omp_get_num_threads()
      ipoints = NMAX / nt
      istart = iam * ipoints
      if (iam .EQ. nt-1) then
        ipoints=NMAX-istart
      endif
      CALL WORK(a,b,istart,ipoints)
!$OMP END PARALLEL

      Do i=2,NMAX
        write(*,*) i,b(i)
      ENDDO

Exercises:

Share a loop

Change the example program of the previous section to include the initialization loop in the parallel section using #pragma omp for

C: teamfor.c

#pragma omp parallel private(iam,nt,ipoints,istart)
  {
#pragma omp for
    for (i=0;i<NMAX;i++) {
      a[i]=i; 
      b[i]=0;
    }
    iam = omp_get_thread_num();
    nt =  omp_get_num_threads();
    ipoints = NMAX / nt;      /* cake slice size */
    istart = iam * ipoints;   /* where it starts */
    if (iam == nt-1)          /* last gets leftovers */
      ipoints = NMAX - istart;
    printf(".thread %d of %d: %d %d\n",iam,nt,istart,ipoints);
    calc(a,b, istart, ipoints);
  }

Exercises

Now review our first example:

#pragma omp parallel for
  for (i=1; i<n; i++) {
    b[i] = 0.5*(a[i-1] + a[i]);
  }

Can be thought to be

#pragma omp parallel
{
#pragma omp for
  for (i=1; i<n; i++) {
    b[i] = 0.5*(a[i-1] + a[i]);
  }
}

Synchronization: Only one can pass at a time

Turnstile

C: reduction.c

  ..
  for (i=0;i<NMAX;i++) {
    a[i]=i+1;
  }

#pragma omp parallel private(iam,nt,ipoints,istart,psum)
  {
    iam = omp_get_thread_num();
    nt =  omp_get_num_threads();
    ipoints = NMAX / nt;     /* size of partition */
    istart = iam * ipoints;  /* starting array index */
    if (iam == nt-1)         /* last thread may do more */
      ipoints = NMAX - istart;
    for (i=istart;i < istart + ipoints;i++)
      psum += a[i];
#pragma omp critical
    tsum += psum;
  }
  printf("%10.3e\n",tsum);

Notes:

Synchronization: Wait till all are done

Startbox

#pragma omp parallel private(iam,nt,ipoints,istart,i)
  {
    iam = omp_get_thread_num();
    nt =  omp_get_num_threads();
    ipoints = NMAX / nt;
    istart = iam * ipoints;
    if (iam == nt-1)
      ipoints = NMAX - istart;

    for (i=istart;i < istart + ipoints;i++) {
      a[i]=sin((2*i+1)*x)/(2.0*i+1);
    }
//#pragma omp barrier
#pragma omp for schedule(dynamic) reduction(+:psum) 
    for (i=0; i < NMAX; i++) {
      psum += a[i];
    }
  }
  printf("%10.3e\n",4.0/M_PI*psum);

Ejercicios:

Only do stuff on one thread

#pragma omp parallel
  {
#pragma omp master
    printf("Running with %d threads\n",omp_get_num_threads());
    printf("Hello from thread %d!\n",omp_get_thread_num());
  }

Usually only one thread does i/o

#pragma omp parallel
  {
#pragma omp single
    printf("Running with %d threads\n",omp_get_num_threads());
    printf("Hello from thread %d!\n",omp_get_thread_num());
  }

Lecturas