\begin{frame}
\titlepage
\centering
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\textit{Here's the code written by the postdoc who has just left - compile it and run it on the SCITAS clusters}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Today we will look at
\end{block}
\begin{itemize}
\item<2-> Hardware
\item<3-> Parallel programming
\item<4-> Compiling and linking
\item<5-> Running on a cluster
\end{itemize}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\Large
Hardware\\ is\\ complicated
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\includegraphics[width=11cm]{images/ccNUMA.pdf}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Pinning processes to cores
\end{block}
\begin{itemize}
\item<2-> {\tt 1} - task is allowed
\item<3-> {\tt 0} - task is excluded
\end{itemize}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
CPU masks for an 8-core system
\begin{itemize}
\item<2-> {\tt 10000000}
\item<3-> {\tt 01000000}
\item<4-> {\tt 00110000}
\item<5-> {\tt 11110000}
\item<6-> {\tt 00001111}
\item<7-> {\tt 11111111}
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
CPU masks for an 8-core system
\begin{itemize}
\item {\tt 10000000 = 0x80}
\item {\tt 01000000 = 0x40}
\item {\tt 00110000 = 0x30}
\item {\tt 11110000 = 0xF0}
\item {\tt 00001111 = 0x0F}
\item {\tt 11111111 = 0xFF}
\end{itemize}
\end{block}
\end{frame}
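%---------------------
\begin{frame}[fragile]
For example, a mask can be applied from the shell with the standard Linux {\tt taskset} utility (the executable name is just illustrative):
\begin{block}{}
\small
\begin{verbatim}
$ taskset 0xF0 ./mycode.x
\end{verbatim}
\normalsize
\end{block}
\begin{block}{}
{\tt 0xF0 = 11110000}, so the process may only run on cores 4 to 7
\end{block}
\end{frame}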
%---------------------
\begin{frame}
\begin{block}{}
\Large
\textbf{S}ingle\\
\textbf{I}nstruction\\
\textbf{M}ultiple\\
\textbf{D}ata\\
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\includegraphics[width=11cm]{images/SIMD_Intel.pdf}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{itemize}
\item<1-> \( 2.6 \times 10^9\) Hz
\item<2-> \(\times \)14 Cores per Socket
\item<3-> \(\times \)2 Sockets
\item<4-> \(\times \)8 Doubles per cycle
\item<5-> \(\times \)2 FP math units per core
\item<6-> \(\times \)2 FMA
\item<7-> \( = 2.3\) TFLOPS
\end{itemize}
\end{block}
\end{frame}
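%---------------------
\begin{frame}
Putting the numbers together:
\begin{block}{}
\begin{center}
\( 2.6 \times 10^9 \times 14 \times 2 \times 8 \times 2 \times 2 \approx 2.3 \times 10^{12} \)\\
\vspace{3mm}
double precision operations per second (peak)
\end{center}
\end{block}
\end{frame}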
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
HPL and HPCG
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
HPL can use roughly 80\% of the peak performance
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
HPCG can use roughly 2\% of the peak performance
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
HPCG is representative of lots of scientific codes
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
Shared Memory
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Every task sees all the data
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
Distributed Memory
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Workers have their own data and can't see data belonging to other workers
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
If a worker needs data belonging to another worker, it has to send a message
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
MPI is the {\it de facto} standard for distributed memory parallelism
\end{block}
\begin{block}{}
OpenMP is the {\it de facto} standard for shared memory parallelism
\end{block}
\end{frame}
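%---------------------
\begin{frame}[fragile]
What MPI code looks like - a minimal sketch:
\begin{block}{}
\footnotesize
\begin{verbatim}
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);               /* start MPI       */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* who am I?       */
    MPI_Comm_size(MPI_COMM_WORLD, &size); /* how many tasks? */
    printf("Task %d of %d\n", rank, size);
    MPI_Finalize();                       /* shut down MPI   */
    return 0;
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}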
%---------------------
\begin{frame}
\begin{block}{}
MPI is a standard with many implementations
\begin{itemize}
\item<2-> MPICH2
\item<3-> MVAPICH2
\item<4-> Intel MPI
\item<5-> OpenMPI
\item<6-> Platform MPI
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Modules on the SCITAS clusters
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Modules are hidden until the dependencies are loaded:
\begin{itemize}
\item<2-> module load compiler
\item<3-> module load MPI implementation
\item<4-> module load BLAS library
\end{itemize}
\end{block}
\end{frame}
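%---------------------
\begin{frame}[fragile]
For example (the module names are illustrative and vary between clusters):
\begin{block}{}
\small
\begin{verbatim}
$ module load gcc
$ module load mvapich2
$ module load openblas
\end{verbatim}
\normalsize
\end{block}
\end{frame}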
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
{ \Large Compiling is magic}
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
The recipe
\begin{block}{}
\begin{itemize}
\item<2-> The name of the source file(s)
\item<3-> The libraries to link against
\item<4-> Where to find these libraries
\item<5-> Where to find the header files
\item<6-> A nice name for the executable
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
{\tt compiler \\ -l <libraries> \\ -L <path to libraries> \\ -I <path to header files> \\ -o <name of executable> \\ mycode.c}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\tt
\$ pwd \\
/home/user/qmcode
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls \\
lib \\
include \\
src \\
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls src/ \\
qmsolve.c
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls lib/ \\
libfastqm.so
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ ls include/ \\
fastqm.h
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\$ icc \\ -lfastqm \\ -L/home/user/qmcode/lib \\ -I/home/user/qmcode/include \\-o qmsolve \\src/qmsolve.c
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Linking makes life better
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Write your own libraries or use ones that have already been written
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
The recipe
\begin{block}{}
\begin{itemize}
\item<2->{\tt-l} name of the library
\item<3->{\tt-L} location of the library
\item<4->{\tt-I} location of the header files containing the library function declarations
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
At runtime we need to set \\
\vspace{5mm}
{\tt LD\_LIBRARY\_PATH}\\
\vspace{5mm}
so the system knows where to find the library
\end{center}
\end{block}
\end{frame}
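\begin{frame}[fragile]
For the {\tt qmcode} example this might look like:
\begin{block}{}
\small
\begin{verbatim}
$ export LD_LIBRARY_PATH=\
/home/user/qmcode/lib:$LD_LIBRARY_PATH
$ ./qmsolve
\end{verbatim}
\normalsize
\end{block}
\end{frame}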
\begin{frame}
\begin{block}{}
\begin{center}
What's linked?
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\small
\begin{block}{}
\begin{verbatim}
$ ldd qmsolve
linux-vdso.so.1 => (0x00007...)
libfastqm.so => \
/home/user/qmcode/lib/libfastqm.so
libc.so.6 => /lib64/libc.so.6
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Libraries to make your life better
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{itemize}
\item<2->{\tt MKL}
\item<3->{\tt OpenBLAS}
\item<4->{\tt FFTW}
\item<5->{\tt Eigen}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\small
\begin{verbatim}
$ module load gcc fftw
$ gcc mycode.c \
-lfftw3 \
-L${FFTW_ROOT}/lib \
-I${FFTW_ROOT}/include \
-o mycode.x
\end{verbatim}
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
Making your own library
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\begin{verbatim}
gcc -fPIC -c fastqm.c
\end{verbatim}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\begin{verbatim}
gcc -shared -o libfastqm.so fastqm.o
\end{verbatim}
\end{block}
\end{frame}
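\begin{frame}[fragile]
For illustration, {\tt fastqm.h} and {\tt fastqm.c} might contain something like this (the function is invented for the example):
\begin{block}{}
\footnotesize
\begin{verbatim}
/* fastqm.h - the declaration shipped in include/ */
double qm_energy(double x);

/* fastqm.c - the definition compiled into the library */
#include "fastqm.h"

double qm_energy(double x)
{
    return 0.5 * x * x;
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}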
%---------------------
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
If you have more than one source file, use a build system
\end{center}
\end{block}
\end{frame}
\begin{frame}
Autotools
\begin{block}{}
\begin{itemize}
\item<2->{\tt ./configure}
\item<3->{\tt make}
\item<4->{\tt make install}
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
cmake
\begin{block}{}
\begin{itemize}
\item<2->{\tt cmake}
\item<3->{\tt make}
\item<4->{\tt make install}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Optimising code
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Do you want to optimise so that:
\begin{itemize}
\item<2-> the executable is as small as possible?
\item<3-> the code runs as fast as possible?
\item<4-> the code has ``perfect'' numerical accuracy?
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Compilers are lazy
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
float matest(float a, float b, float c)
{
a = a*b + c;
return a;
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
\tt gcc -S matest.c
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
matest(float, float, float):
push rbp
mov rbp,rsp
movss DWORD PTR [rbp-0x4],xmm0
movss DWORD PTR [rbp-0x8],xmm1
movss DWORD PTR [rbp-0xc],xmm2
movss xmm0,DWORD PTR [rbp-0x4]
mulss xmm0,DWORD PTR [rbp-0x8]
addss xmm0,DWORD PTR [rbp-0xc]
movss DWORD PTR [rbp-0x4],xmm0
mov eax,DWORD PTR [rbp-0x4]
mov DWORD PTR [rbp-0x10],eax
movss xmm0,DWORD PTR [rbp-0x10]
pop rbp
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Let's make things faster
\begin{itemize}
\item<2-> {\tt -O0}
\item<3-> {\tt -O1}
\item<4-> {\tt -O2}
\item<5-> {\tt -O3}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
\tt gcc -S -O3 matest.c
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\small
\begin{verbatim}
matest(float, float, float):
mulss xmm0,xmm1
addss xmm0,xmm2
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Let's take advantage of the hardware (Intel compiler flags)
\begin{itemize}
\item<2-> {\tt -xAVX}
\item<3-> {\tt -xCORE-AVX2}
\item<4-> {\tt -xCORE-AVX512}
\item<5-> {\tt -xHOST}
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\large
\tt icc -S -O3 -xCORE-AVX2 matest.c
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\small
\begin{verbatim}
matest(float, float, float):
vfmadd132ss xmm0,xmm2,xmm1
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
This is an improvement on the default
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
matest(float, float, float):
push rbp
mov rbp,rsp
movss DWORD PTR [rbp-0x4],xmm0
movss DWORD PTR [rbp-0x8],xmm1
movss DWORD PTR [rbp-0xc],xmm2
movss xmm0,DWORD PTR [rbp-0x4]
mulss xmm0,DWORD PTR [rbp-0x8]
addss xmm0,DWORD PTR [rbp-0xc]
movss DWORD PTR [rbp-0x4],xmm0
mov eax,DWORD PTR [rbp-0x4]
mov DWORD PTR [rbp-0x10],eax
movss xmm0,DWORD PTR [rbp-0x10]
pop rbp
ret
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Compilers are stupid \\
\footnotesize
\it
or at least not as intelligent as you might like
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
void myfunc( double *a1, double *a2, double *prod, int c)
{
for(int x=0; x < c; x++)
{
prod[x] = a1[x] * a2[x];
}
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
\large
{\tt icc -O3 -xCORE-AVX512 mycode.c } \\
\normalsize
\footnotesize
\it
sometimes isn't enough :-(
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
Manual intervention is required...\\
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
void myfunc( double *restrict a1, double *restrict a2,
double *prod, int c)
{
for(int x=0; x < c; x=x+8)
{
prod[x] = a1[x] * a2[x];
prod[x+1] = a1[x+1] * a2[x+1];
prod[x+2] = a1[x+2] * a2[x+2];
prod[x+3] = a1[x+3] * a2[x+3];
prod[x+4] = a1[x+4] * a2[x+4];
prod[x+5] = a1[x+5] * a2[x+5];
prod[x+6] = a1[x+6] * a2[x+6];
prod[x+7] = a1[x+7] * a2[x+7];
}
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\large
{\tt icc -O2 -xCORE-AVX512 -qopt-zmm-usage=high } \\
\normalsize
\end{center}
\end{block}
\end{frame}
\begin{frame}[fragile]
The loop then becomes:
\begin{block}{}
\footnotesize
\begin{verbatim}
vmovups zmm0, ZMMWORD PTR [rdi+r8*8]
vmulpd zmm1, zmm0, ZMMWORD PTR[rsi+r8*8]
vmovupd ZMMWORD PTR[rdx+r8*8], zmm1
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
{ \Large mpicc is your friend }\\
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
mpicc is the "MPI code compiler"
\begin{itemize}
\item<2-> {\tt mpicc}
\item<3-> {\tt mpiicc}
\item<4-> {\tt mpiifort}
\item<5-> {\tt mpicxx}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
mpicc is really a wrapper for the standard (GCC and Intel) compilers
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\tiny
\begin{verbatim}
$ mpicc -show mycode.c
/ssoft/spack/paien/v2/opt/gcc/bin/gcc mycode.c
-I/ssoft/spack/paien/v2/opt/mvapich2/include
-L/ssoft/spack/paien/v2/opt/mvapich2/lib
-lmpi
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
OpenMP support comes from the standard compiler
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
The syntax is compiler-dependent
\begin{itemize}
\item<2-> {\tt gcc -fopenmp mycode.c}
\item<3-> {\tt icc -qopenmp mycode.c}
\item<4-> {\tt gfortran -fopenmp mycode.f95}
\item<5-> {\tt ifort -qopenmp mycode.f95 }
\end{itemize}
\end{block}
\end{frame}
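%---------------------
\begin{frame}[fragile]
What OpenMP code looks like - again just a minimal sketch:
\begin{block}{}
\footnotesize
\begin{verbatim}
#include <omp.h>
#include <stdio.h>

int main(void)
{
    #pragma omp parallel  /* fork a team of threads */
    {
        printf("Thread %d of %d\n",
               omp_get_thread_num(),
               omp_get_num_threads());
    }
    return 0;
}
\end{verbatim}
\normalsize
\end{block}
\end{frame}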
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Running parallel codes
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\it
How can we start our parallel code on lots of nodes at the same time?
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
\tt
\large
srun mycode.x
\normalsize
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
{\it What resources do I need?}
\begin{itemize}
\item<2-> Number of nodes?
\item<3-> Memory per node?
\item<4-> Total number of tasks?
\item<5-> Tasks per node?
\item<6-> Hardware architecture?
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Job scripts
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
#!/bin/bash
#SBATCH --ntasks=96
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4096
module purge
module load gcc
module load mvapich2
srun my_mpi_code.x --in=myinput.dat
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
{\it Where will these 96 tasks end up? }
\begin{itemize}
\item<2-> 24:24:24:24 ?
\item<3-> 28:28:28:12 ?
\item<4-> 16:16:16:16:16 ?
\end{itemize}
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
{\tt --ntasks-per-node=24} \\
or \\
{\tt --distribution=cyclic:block}
\end{block}
\end{frame}
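%---------------------
\begin{frame}[fragile]
For example, to force the 24:24:24:24 layout for the 96 tasks above:
\begin{block}{}
\footnotesize
\begin{verbatim}
#SBATCH --ntasks=96
#SBATCH --ntasks-per-node=24
\end{verbatim}
\normalsize
\end{block}
\end{frame}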
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Hybrid codes
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=6
#SBATCH --cpus-per-task=4
#SBATCH --mem=100G
module purge
module load intel intel-mpi
export OMP_NUM_THREADS=4
srun my_hybrid_code.x --in=myinput.dat
\end{verbatim}
\normalsize
\end{block}
\end{frame}
\begin{frame}[fragile]
\begin{block}{}
\footnotesize
\begin{verbatim}
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=6
#SBATCH --cpus-per-task=4
#SBATCH --mem=100G
module purge
module load intel intel-mpi
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
srun my_hybrid_code.x --in=myinput.dat
\end{verbatim}
\normalsize
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
Binding tasks to cores with srun
\begin{itemize}
\item<2-> {\tt --cpu\_bind=rank }
\item<3-> {\tt --cpu\_bind=verbose,rank }
\item<4-> {\tt --cpu\_bind=sockets}
\item<5-> {\tt --cpu\_bind=mask\_cpu:f,f0}
\end{itemize}
\end{center}
\end{block}
\end{frame}
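%---------------------
\begin{frame}
For example, assuming two tasks per node, {\tt --cpu\_bind=mask\_cpu:f,f0} means
\begin{block}{}
\begin{itemize}
\item {\tt f = 00001111} - task 0 is restricted to cores 0 to 3
\item {\tt f0 = 11110000} - task 1 is restricted to cores 4 to 7
\end{itemize}
\end{block}
\end{frame}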
%---------------------
\begin{frame}
\begin{block}{}
\begin{center}
With OpenMP we can
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
Set the number of threads per domain:
\begin{itemize}
\item<2-> {\tt export OMP\_NUM\_THREADS=8 }
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
Control how the threads are placed when using Intel
\begin{itemize}
\small
\item<2-> {\tt export KMP\_AFFINITY=scatter}
\item<3-> {\tt export KMP\_AFFINITY=compact }
\normalsize
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
Control how the threads are placed when using GCC
\begin{itemize}
\small
\item<2-> {\tt export OMP\_PROC\_BIND=SPREAD}
\item<3-> {\tt export OMP\_PROC\_BIND=CLOSE}
\item<4-> {\tt export GOMP\_CPU\_AFFINITY="0 4 8 12"}
\normalsize
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
Errors...
\end{center}
\end{block}
\end{frame}
%---------------------
\begin{frame}
\begin{block}{}
\tt
\small
Please verify that both the operating system and the processor support Intel MOVBE, FMA, BMI, LZCNT and AVX2 instructions.
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\tt
\small
./run.x: error while loading shared libraries:
libmkl\_intel\_lp64.so: cannot open shared object file:
No such file or directory
\normalsize
\end{block}
\end{frame}
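\begin{frame}[fragile]
This usually means the module providing the library was not loaded in the job script (the module name here is illustrative):
\begin{block}{}
\small
\begin{verbatim}
module load intel
\end{verbatim}
\normalsize
\end{block}
\end{frame}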
\begin{frame}
\begin{block}{}
\tt
\footnotesize
Fatal error in MPI\_Init: Other MPI error, error
stack:
.MPIR\_Init\_thread(514):
.MPID\_Init(320).......: channel initialization failed
.MPID\_Init(716).......: PMI\_Get\_id returned 14
\normalsize
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
Going further
\end{center}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
Introduction to profiling and software optimisation (1 day)
\end{block}
\begin{block}{}
Introduction to parallel programming with OpenMP and MPI (3 days)
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\end{block}
\end{frame}
\begin{frame}
\begin{block}{}
\begin{center}
https://scitas-data.epfl.ch/kb
\end{center}
\end{block}
\end{frame}