Skip to content
Snippets Groups Projects
Commit 0266ce25 authored by Ewan Roche's avatar Ewan Roche
Browse files

initial commit

parents
No related branches found
No related tags found
No related merge requests found
\begin{frame}
\titlepage
\centering
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\textit{Here's the code written by the postdoc who has just left - compile it and run it on the SCITAS clusters}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Hardware\\ is\\ complicated
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\includegraphics[width=11cm]{images/ccNUMA.pdf}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Pinning processes to cores
\end{block}
\begin{itemize}
\item<1-> \texttt{1} - task is allowed
\item<2-> \texttt{0} - task is excluded
\end{itemize}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
CPU masks
\begin{itemize}
\item<1-> \texttt{10000000}
\item<2-> \texttt{01000000}
\item<3-> \texttt{00110000}
\item<4-> \texttt{11110000}
\item<5-> \texttt{00001111}
\item<6-> \texttt{11111111}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
SIMD = Single Instruction Multiple Data\\
\vspace{5mm}
Why processing ``power'' has increased while clock speeds have decreased.
\end{block}
\end{frame}
%---------------------
\begin{frame}
\includegraphics[width=11cm]{images/SIMD_Intel.pdf}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{itemize}
\item<1-> \( 2.6 \times 10^9\) Hz
\item<2-> \(\times \)14 Cores per Socket
\item<3-> \(\times \)2 Sockets
\item<4-> \(\times \)8 Doubles per cycle
\item<5-> \(\times \)2 FP math units per core
\item<6-> \(\times \)2 FMA
\item<7-> \( = 2.3\) TFLOPS
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
HPL and HPCG
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
SIMD
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Shared Memory
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Distributed Memory
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
MPI is the {\it de facto} standard for distributed memory parallelism
\end{block}
\begin{block}{}
OpenMP is the {\it de facto} standard for shared memory parallelism
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
MPI is a standard with many implementations
\begin{itemize}
\item<2-> MPICH2
\item<3-> MVAPICH2
\item<4-> Intel MPI
\item<5-> OpenMPI
\item<6-> Platform MPI
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Modules on the SCITAS clusters
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Modules are hidden until the dependencies are loaded:
\begin{itemize}
\item<2-> module load compiler
\item<3-> module load MPI implementation
\item<4-> module load BLAS library
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Compiling is magic
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
The recipe
\begin{block}{}
\begin{itemize}
\item<2-> The name of the source file(s)
\item<3-> The libraries to link against
\item<4-> Where to find these libraries
\item<5-> Where to find the header files
\item<6-> A nice name for the executable
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
{\ttfamily compiler \\ -l libraries \\ -L <path to libraries> \\ -I <path to header files> \\ -o <name of executable> \\ mycode.c}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\tt
\$ pwd \\
/home/user/qmcode
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls \\
lib \\
include \\
src \\
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls src/ \\
qmsolve.c
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls lib/ \\
libfastqm.so
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls include/ \\
fastqm.h
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ icc \\ -lfastqm \\ -L/home/user/qmcode/lib \\ -I/home/user/qmcode/include \\-o qmsolve \\src/qmsolve.c
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Linking makes life better
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Write your own or use already written ones
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
The recipe
\begin{block}{}
\begin{itemize}
\item<2->{\tt-l} name of the library
\item<3->{\tt-L} location of the library
\item<4->{\tt-I} location of the header files containing the library function definitions
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\begin{verbatim}
$ module load gcc fftw
$ gcc mycode.c \
-lfftw3 \
-L${FFTW_ROOT}/lib \
-I${FFTW_ROOT}/include \
-o mycode.x
\end{verbatim}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
What's linked?
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\begin{verbatim}
$ ldd hi
linux-vdso.so.1 => (0x00007...)
liboutput.so => \
/home/user/mycode/liboutput.so
libc.so.6 => /lib64/libc.so.6
\end{verbatim}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Libraries to make your life better
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{itemize}
\item<1->{\tt MKL}
\item<2->{\tt OpenBLAS}
\item<3->{\tt FFTW}
\item<4->{\tt Eigen}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Optimising code
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Do you want to optimise so that:
\begin{itemize}
\item<2-> the executable is as small as possible
\item<3-> the code runs as fast as possible
\item<4-> the code has ``perfect'' numerical accuracy
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Compilers are lazy
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
float matest(float a, float b, float c)
{
a = a*b + c;
return a;
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
\tt gcc -S matest.c
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
matest(float, float, float):
push rbp
mov rbp,rsp
movss DWORD PTR [rbp-0x4],xmm0
movss DWORD PTR [rbp-0x8],xmm1
movss DWORD PTR [rbp-0xc],xmm2
movss xmm0,DWORD PTR [rbp-0x4]
mulss xmm0,DWORD PTR [rbp-0x8]
addss xmm0,DWORD PTR [rbp-0xc]
movss DWORD PTR [rbp-0x4],xmm0
mov eax,DWORD PTR [rbp-0x4]
mov DWORD PTR [rbp-0x10],eax
movss xmm0,DWORD PTR [rbp-0x10]
pop rbp
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Let's make things faster
\begin{itemize}
\item<2-> {\tt -O0}
\item<3-> {\tt -O1}
\item<4-> {\tt -O2}
\item<5-> {\tt -O3}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
\tt gcc -S -O3 matest.c
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\small
\begin{verbatim}
matest(float, float, float):
mulss xmm0,xmm1
addss xmm0,xmm2
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Let's take advantage of the hardware
\begin{itemize}
\item<2-> {\tt -xAVX}
\item<3-> {\tt -xCORE-AVX2}
\item<4-> {\tt -xCORE-AVX512}
\item<5-> {\tt -xHOST}
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\large
\tt icc -S -O3 -xCORE-AVX2 matest.c
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\small
\begin{verbatim}
matest(float, float, float):
vfmadd132ss xmm0,xmm2,xmm1
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
This is an improvement on the default
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
matest(float, float, float):
push rbp
mov rbp,rsp
movss DWORD PTR [rbp-0x4],xmm0
movss DWORD PTR [rbp-0x8],xmm1
movss DWORD PTR [rbp-0xc],xmm2
movss xmm0,DWORD PTR [rbp-0x4]
mulss xmm0,DWORD PTR [rbp-0x8]
addss xmm0,DWORD PTR [rbp-0xc]
movss DWORD PTR [rbp-0x4],xmm0
mov eax,DWORD PTR [rbp-0x4]
mov DWORD PTR [rbp-0x10],eax
movss xmm0,DWORD PTR [rbp-0x10]
pop rbp
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Compilers can be stupid \\
\footnotesize
\it
or at least not as intelligent as you might like
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
void myfunc( double *a1, double *a2, double *prod, int c)
{
for(int x=0; x < c; x++)
{
prod[x] = a1[x] * a2[x];
}
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
{\tt icc -O3 -xCORE-AVX512 matest.c } \\
\normalsize
\footnotesize
\it
sometimes isn't enough :-(
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
Manual intervention is required...\\
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
void myfunc( double *restrict a1, double *restrict a2,
double *prod, int c)
{
for(int x=0; x < c; x=x+8)
{
prod[x] = a1[x] * a2[x];
prod[x+1] = a1[x+1] * a2[x+1];
prod[x+2] = a1[x+2] * a2[x+2];
prod[x+3] = a1[x+3] * a2[x+3];
prod[x+4] = a1[x+4] * a2[x+4];
prod[x+5] = a1[x+5] * a2[x+5];
prod[x+6] = a1[x+6] * a2[x+6];
prod[x+7] = a1[x+7] * a2[x+7];
}
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\large
{\tt icc -O2 -xCORE-AVX512 -qopt-zmm-usage=high } \\
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
The loop then becomes:
\begin{block}{}
\footnotesize
\begin{verbatim}
vmovups zmm0, ZMMWORD PTR [rdi+r8*8]
vmulpd zmm1, zmm0, ZMMWORD PTR[rsi+r8*8]
vmovupd ZMMWORD PTR[rdx+r8*8], zmm1
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
mpicc is your friend\\
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
mpicc is the ``MPI code compiler''
\begin{itemize}
\item<2-> {\tt mpicc}
\item<3-> {\tt mpiicc}
\item<4-> {\tt mpiifort}
\item<5-> {\tt mpicxx}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
mpicc is really a wrapper for the standard (GCC and Intel) compilers
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
$ mpicc -show mycode.c
/ssoft/spack/paien/v2/opt/gcc/bin/gcc mycode.c
-I/ssoft/spack/paien/v2/opt/mvapich2/include
-L/ssoft/spack/paien/v2/opt/mvapich2/lib
-lmpi
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
OpenMP support comes from the standard compiler
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Syntax is compiler dependent
\begin{itemize}
\item<2-> {\tt gcc -fopenmp mycode.c}
\item<3-> {\tt icc -qopenmp mycode.c}
\item<4-> {\tt gfortran -fopenmp mycode.f95}
\item<5-> {\tt ifort -qopenmp mycode.f95 }
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Running parallel codes
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\it
How can we start our parallel code on lots of nodes at the same time?
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\tt
\large
srun mycode.x
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
{\it What resources do I need?}
\begin{itemize}
\item<2-> Number of nodes?
\item<3-> Memory per node?
\item<4-> Total number of tasks?
\item<5-> Tasks per node?
\item<6-> Hardware architecture?
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Job scripts
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
#!/bin/bash
#SBATCH --ntasks=96
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4096
module purge
module load gcc
module load mvapich2
srun my_mpi_code.x --in=myinput.dat
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
{\it Where will these 96 tasks end up? }
\begin{itemize}
\item<2-> 24:24:24:24 ?
\item<3-> 28:28:28:12 ?
\item<4-> 16:16:16:16:16 ?
\end{itemize}
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
{\tt --ntasks-per-node=24} \\
or \\
{\tt --distribution=cyclic:block}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Hybrid codes
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=6
#SBATCH --cpus-per-task=4
#SBATCH --mem=100G
module purge
module load intel intel-mpi
export OMP_NUM_THREADS=4
srun my_hybrid_code.x --in=myinput.dat
\end{verbatim}
\normalsize
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=6
#SBATCH --cpus-per-task=4
#SBATCH --mem=100G
module purge
module load intel intel-mpi
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
srun my_hybrid_code.x --in=myinput.dat
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Binding tasks to cores with srun
\begin{itemize}
\item<2-> {\tt --cpu\_bind=rank }
\item<3-> {\tt --cpu\_bind=verbose,rank }
\item<4-> {\tt --cpu\_bind=sockets}
\item<5-> {\tt --cpu\_bind=mask\_cpu:f,f0}
\end{itemize}
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
With OpenMP we can
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Set the number of threads per domain:
\begin{itemize}
\item<2-> {\tt export OMP\_NUM\_THREADS=8 }
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
Control how the threads are placed when using Intel
\begin{itemize}
\small
\item<2-> {\tt export KMP\_AFFINITY=scatter}
\item<3-> {\tt export KMP\_AFFINITY=compact }
\normalsize
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
Control how the threads are placed when using GCC
\begin{itemize}
\small
\item<2-> {\tt export OMP\_PROC\_BIND=SPREAD}
\item<3-> {\tt export OMP\_PROC\_BIND=CLOSE}
\item<4-> {\tt export GOMP\_CPU\_AFFINITY="0 4 8 12"}
\normalsize
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
Errors...
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\tt
\small
Please verify that both the operating system and the processor support Intel MOVBE, FMA, BMI, LZCNT and AVX2 instructions.
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\small
./run.x: error while loading shared libraries:
libmkl\_intel\_lp64.so: cannot open shared object file:
No such file or directory
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\footnotesize
Fatal error in MPI\_Init: Other MPI error, error
stack:
.MPIR\_Init\_thread(514):
.MPID\_Init(320).......: channel initialization failed
.MPID\_Init(716).......: PMI\_Get\_id returned 14
\normalsize
\end{block}
\end{frame}
File added
This diff is collapsed.
\documentclass[20pt, xcolor={usenames,dvipsnames}]{beamer}
\usepackage{alltt}
\hypersetup{
pdftitle={Compiling code and using MPI},
pdfauthor={http://scitas.epfl.ch},
colorlinks=true,
urlcolor=blue
}
\title{Compiling code and using MPI}
\author{\url{scitas.epfl.ch}}
\date{\today}
\begin{document}
\input{document}
\end{document}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment