Job samples for Pi supercomputer

Job samples used in this document can be found in /lustre/usr/samples. Before proceeding, please read the documentation on Environment Modules.
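For example, a typical session copies a sample into your own workspace and points Environment Modules at the Pi modulefiles. This is a minimal sketch; the paths match those used in the job scripts later in this document, and LINPACK stands in for whichever sample you need:

$ mkdir -p ~/tmp && cd ~/tmp
$ cp -r /lustre/usr/samples/LINPACK .    # copy the sample you want to work on
$ module use /lustre/usr/modulefiles/pi  # make the Pi modulefiles visible
$ module avail                           # list the available compilers and MPI stacks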

An OpenMP example

An OpenMP example program, omp_hello.c, is as follows:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char *argv[])
{
  int nthreads, tid;

  /* Fork a team of threads giving them their own copies of variables */
  #pragma omp parallel private(nthreads, tid)
  {
    /* Obtain thread number */
    tid = omp_get_thread_num();
    printf("Hello World from thread = %d\n", tid);

    /* Only master thread does this */
    if (tid == 0)
    {
      nthreads = omp_get_num_threads();
      printf("Number of threads = %d\n", nthreads);
    }
  }  /* All threads join master thread and disband */
}

Build and run with GCC 4.9.1

$ module purge; module load gcc
$ gcc -fopenmp omp_hello.c -o omphello

Run the application with 4 threads locally:

$ export OMP_NUM_THREADS=4 && ./omphello

Prepare a job script named ompgcc.slurm:

#!/bin/bash

#SBATCH --job-name=Hello_OpenMP
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH -n 16
#SBATCH --ntasks-per-node=16

source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load gcc

export OMP_NUM_THREADS=16
./omphello

Submit to SLURM:

$ sbatch -p cpu ompgcc.slurm
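Once the job is submitted, you can check its state and, after it finishes, read the output file named after the job ID. A quick sketch; replace <jobid> with the ID printed by sbatch:

$ squeue -u $USER   # list your queued and running jobs
$ cat <jobid>.out   # job stdout (see --output=%j.out in the script)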

Build and run with Intel Compiler

$ module purge; module load icc
$ icc -fopenmp omp_hello.c -o omphello

Run the application with 4 threads locally:

$ export OMP_NUM_THREADS=4 && ./omphello

Prepare a job script named ompicc.slurm:

#!/bin/bash

#SBATCH --job-name=Hello_OpenMP
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH -n 16
#SBATCH --ntasks-per-node=16


source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load icc

export OMP_NUM_THREADS=16
./omphello

Submit to SLURM:

$ sbatch -p cpu ompicc.slurm

An MPI Sample

A sample program, mpihello.c, is as follows:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>   /* gethostname(), getpid() */

#define MAX_HOSTNAME_LENGTH 256

int main(int argc, char *argv[])
{
    int pid;
    char hostname[MAX_HOSTNAME_LENGTH];

    int numprocs;
    int rank;

    int rc;

    /* Initialize MPI. Pass reference to the command line to
     * allow MPI to take any arguments it needs
     */
    rc = MPI_Init(&argc, &argv);

    /* It's always good to check the return values on MPI calls */
    if (rc != MPI_SUCCESS)
    {
        fprintf(stderr, "MPI_Init failed\n");
        return 1;
    }

    /* Get the number of processes and the rank of this process */
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* let's see who we are to the "outside world" - what host and what PID */
    gethostname(hostname, MAX_HOSTNAME_LENGTH);
    pid = getpid();

    /* say who we are */
    printf("Rank %d of %d has pid %5d on %s\n", rank, numprocs, pid, hostname);
    fflush(stdout);

    /* allow MPI to clean up after itself */
    MPI_Finalize();
    return 0;
}

Build and run with OpenMPI+GCC

$ module purge && module load gcc openmpi
$ mpicc mpihello.c -o mpihello
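Before writing a batch script, you may want a quick interactive test with a couple of ranks. This is a sketch, assuming interactive srun is allowed on the cpu partition:

$ srun -p cpu -n 2 --mpi=pmi2 ./mpihello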

Prepare a SLURM job script named job_openmpi.slurm:

#!/bin/bash

#SBATCH --job-name=mpihello
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH -n 64
#SBATCH --ntasks-per-node=16

source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load gcc openmpi

srun --mpi=pmi2 ./mpihello

Finally, submit your jobs to SLURM:

$ sbatch -p cpu job_openmpi.slurm

Build and run with Intel Compiler + Intel MPI

$ module purge; module load icc impi
$ mpiicc mpihello.c -o mpihello

Prepare a SLURM job script named job_impi.slurm:

#!/bin/bash

#SBATCH --job-name=mpihello
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH -n 64
#SBATCH --ntasks-per-node=16

source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load icc impi

export I_MPI_PMI_LIBRARY=/usr/lib64/libpmi.so
export I_MPI_FABRICS=shm:dapl

srun ./mpihello

Finally, submit your jobs to SLURM:

$ sbatch -p cpu job_impi.slurm
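If srun fails to bootstrap Intel MPI, first confirm that the PMI library path exported in the script (via I_MPI_PMI_LIBRARY) actually exists on the nodes you run on. A quick check:

$ ls -l /usr/lib64/libpmi.so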

A Hybrid MPI+OpenMP example

hybridmpi.c

#include <stdio.h>
#include "mpi.h"
#include <omp.h>

int main(int argc, char *argv[]) {
  int numprocs, rank, namelen;
  char processor_name[MPI_MAX_PROCESSOR_NAME];
  int iam = 0, np = 1;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Get_processor_name(processor_name, &namelen);

  #pragma omp parallel default(shared) private(iam, np)
  {
    np = omp_get_num_threads();
    iam = omp_get_thread_num();
    printf("Hello from thread %d out of %d from process %d out of %d on %s\n",
           iam, np, rank, numprocs, processor_name);
  }

  MPI_Finalize();
}

GCC

Build:

$ module load gcc openmpi
$ mpicc -O3 -fopenmp hybridmpi.c -o hybridmpi

Prepare a SLURM job script named hybridmpi.slurm:

#!/bin/bash

#SBATCH --job-name=HybridMPI
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --time=00:01:00 

source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load gcc openmpi

export OMP_NUM_THREADS=16
srun --mpi=pmi2 ./hybridmpi

ICC

Build:

$ module load icc impi
$ mpiicc -O3 -fopenmp hybridmpi.c -o hybridmpi

Prepare a SLURM job script named hybridmpi.slurm:

#!/bin/bash

#SBATCH --job-name=HybridMPI
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --time=00:01:00 

source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load icc impi

export I_MPI_DEBUG=5
export I_MPI_PMI_LIBRARY=/usr/lib64/libpmi.so
export I_MPI_FABRICS=shm:dapl

export OMP_NUM_THREADS=16
srun ./hybridmpi

Submit the job to SLURM, requesting 4 compute nodes:

$ sbatch -N 4 hybridmpi.slurm
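With 4 nodes, one MPI rank per node and 16 OpenMP threads per rank, the output file should contain 4 x 16 = 64 "Hello from thread" lines. A quick sanity check; replace <jobid> with your job ID:

$ grep -c "Hello from thread" <jobid>.out   # expect 64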

A CUDA Sample

cublashello.cu

//Example 2. Application Using C and CUBLAS: 0-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    cublasSscal (handle, n-p, &alpha, &m[IDX2C(p,q,ldm)], ldm);
    cublasSscal (handle, ldm-p, &beta, &m[IDX2C(p,q,ldm)], 1);
}

int main (void){
    cudaError_t cudaStat;    
    cublasStatus_t stat;
    cublasHandle_t handle;
    int i, j;
    float* devPtrA;
    float* a = 0;
    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        return EXIT_FAILURE;
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
        }
    }
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed");
        return EXIT_FAILURE;
    }
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    cudaFree (devPtrA);
    cublasDestroy(handle);
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", a[IDX2C(i,j,M)]);
        }
        printf ("\n");
    }
    free(a);
    return EXIT_SUCCESS;
}

Build:

$ module purge; module load gcc cuda
$ nvcc cublashello.cu -o cublashello -lcublas
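If interactive runs are allowed on the gpu partition, you can test the binary before writing the batch script. A sketch using the same --gres syntax as the job script below:

$ srun -p gpu -n 1 --gres=gpu:1 ./cublashello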

The job script cublashello.slurm is as follows:

#!/bin/bash

#SBATCH --job-name=cublas
#SBATCH --partition=gpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH -n 1
#SBATCH --gres=gpu:1

source /usr/share/Modules/init/bash
module purge
module load gcc cuda

./cublashello

Submit the job to the gpu partition on SLURM:

$ sbatch -p gpu cublashello.slurm

Run Intel LINPACK via sbatch

Here is an MPI job that runs on multiple nodes.

Firstly, let's prepare the executable and the input data:

$ cd ~/tmp
$ cp /lustre/usr/samples/LINPACK/64/xhpl_intel64 .
$ cp /lustre/usr/samples/LINPACK/64/HPL.dat .

Secondly, prepare a job script named linpack.sh. In this script, we request 64 cores with 16 cores per node (that is, 4 nodes) on the cpu partition. Please note that MPI jobs are launched via srun, not mpirun.

#!/bin/bash

#SBATCH --job-name=Intel_MPLINPACK
#SBATCH --partition=cpu
#SBATCH --mail-type=end
#SBATCH --mail-user=YOU@EMAIL.COM
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH -n 64
#SBATCH --ntasks-per-node=16

source /usr/share/Modules/init/bash
unset MODULEPATH
module use /lustre/usr/modulefiles/pi
module purge
module load icc mkl impi

export I_MPI_PMI_LIBRARY=/usr/lib64/libpmi.so
export I_MPI_FABRICS=shm:dapl
export I_MPI_DEBUG=100

srun ./xhpl_intel64

Finally, submit the job to SLURM.

$ sbatch linpack.sh
Submitted batch job 358
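While the job is running, squeue reports its state (358 is the job ID printed by sbatch above):

$ squeue -j 358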

We can attach to the running job step and watch its STDOUT and STDERR (press Ctrl-C to stop watching):

$ sattach 358.0

Alternatively, we can follow the job's output file directly:

$ tail -f /lustre/home/hpc-jianwen/tmp/358.out

Terminate the job:

$ scancel 358