\begin{frame} \titlepage \centering \end{frame}
%---------------------
\begin{frame} \begin{block}{} \textit{Here's the code written by the postdoc who has just left - compile it and run it on the SCITAS clusters} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Today we will look at \end{block} \begin{itemize} \item<2-> Hardware \item<3-> Parallel programming \item<4-> Compiling and linking \item<5-> Running on a cluster \end{itemize} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \Large Hardware\\ is\\ complicated \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \includegraphics[width=11cm]{images/ccNUMA.pdf} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Pinning processes to cores \end{block} \begin{itemize} \item<2-> \tt{1} - task is allowed \item<3-> \tt{0} - task is excluded \end{itemize} \end{frame}
%---------------------
\begin{frame} \begin{block}{} CPU masks for an 8-core system \begin{itemize} \item<2-> \tt{10000000} \item<3-> \tt{01000000} \item<4-> \tt{00110000} \item<5-> \tt{11110000} \item<6-> \tt{00001111} \item<7-> \tt{11111111} \end{itemize} \end{block} \end{frame}
\begin{frame} \begin{block}{} CPU masks for an 8-core system \begin{itemize} \item \tt{10000000 = 0x80} \item \tt{01000000 = 0x40} \item \tt{00110000 = 0x30} \item \tt{11110000 = 0xF0} \item \tt{00001111 = 0xF} \item \tt{11111111 = 0xFF} \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \Large \textbf{S}ingle\\ \textbf{I}nstruction\\ \textbf{M}ultiple\\ \textbf{D}ata\\ \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \includegraphics[width=11cm]{images/SIMD_Intel.pdf} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{itemize} \item<1-> \( 2.6 \times 10^9\) Hz \item<2-> \(\times \)14 Cores per Socket \item<3-> \(\times \)2 Sockets \item<4-> \(\times \)8 Doubles per vector (AVX-512) \item<5-> \(\times \)2 FP math units per core \item<6-> \(\times \)2 ops per FMA (multiply + add) \item<7-> \( = 2.3\) TFLOPS \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \large HPL and HPCG \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} HPL can use roughly 80\% of the peak performance \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} HPCG can use roughly 2\% of the peak performance \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} HPCG is representative of lots of scientific codes \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \large Shared Memory \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Every task sees all the data \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \large Distributed Memory \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Workers have their own data and can't see data belonging to other workers \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} If they need to see another worker's data, they have to send a message \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} MPI is the {\it de facto} standard for distributed memory parallelism \end{block} \begin{block}{} OpenMP is the {\it de facto} standard for shared memory parallelism \end{block} \end{frame}
%---------------------
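\begin{frame}[fragile] What sending a message looks like in practice: a minimal MPI sketch (illustrative only, not the postdoc's code): \begin{block}{} \tiny \begin{verbatim}
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, value;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        value = 42;   /* data only rank 0 can see */
        MPI_Send(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        /* rank 1 has to receive a message to see it */
        MPI_Recv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
        printf("rank 1 received %d\n", value);
    }
    MPI_Finalize();
    return 0;
}
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------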
\begin{frame} \begin{block}{} MPI is a standard with many implementations \begin{itemize} \item<2-> MPICH2 \item<3-> MVAPICH2 \item<4-> Intel MPI \item<5-> OpenMPI \item<6-> Platform MPI \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Modules on the SCITAS clusters \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Modules are hidden until their dependencies are loaded: \begin{itemize} \item<2-> module load compiler \item<3-> module load MPI implementation \item<4-> module load BLAS library \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} { \Large Compiling is magic} \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} The recipe \begin{block}{} \begin{itemize} \item<2-> The name of the source file(s) \item<3-> The libraries to link against \item<4-> Where to find these libraries \item<5-> Where to find the header files \item<6-> A nice name for the executable \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \tt{compiler \\ -l <libraries> \\ -L <path to libraries> \\ -I <path to header files> \\ -o <name of executable> \\ mycode.c} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \tt \$ pwd \\ /home/user/qmcode \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \$ ls \\ lib \\ include \\ src \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \$ ls src/ \\ qmsolve.c \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \$ ls lib/ \\ libfastqm.so \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \$ ls include/ \\ fastqm.h \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \$ icc \\ -lfastqm \\ -L/home/user/qmcode/lib \\ -I/home/user/qmcode/include \\ -o qmsolve \\ src/qmsolve.c \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Linking against libraries makes life better \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Write your own or use already written ones \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} The recipe \begin{block}{} \begin{itemize} \item<2->{\tt-l} name of the library \item<3->{\tt-L} location of the library \item<4->{\tt-I} location of the header files containing the library function declarations \end{itemize} \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} At runtime we need to set \\ \vspace{5mm} {\tt LD\_LIBRARY\_PATH}\\ \vspace{5mm} so the system knows where to find the library \end{center} \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} What's linked? \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \small \begin{block}{} \begin{verbatim}
$ ldd qmsolve
  linux-vdso.so.1 => (0x00007...)
  libfastqm.so => \
    /home/user/qmcode/lib/libfastqm.so
  libc.so.6 => /lib64/libc.so.6
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Libraries to make your life better \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{itemize} \item<2->{\tt MKL} \item<3->{\tt OpenBLAS} \item<4->{\tt FFTW} \item<5->{\tt Eigen} \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \small \begin{verbatim}
$ module load gcc fftw
$ gcc mycode.c \
      -lfftw3 \
      -L${FFTW_ROOT}/lib \
      -I${FFTW_ROOT}/include \
      -o mycode.x
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
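\begin{frame}[fragile] A minimal sketch of what {\tt mycode.c} might contain, using the basic FFTW plan/execute API and linked exactly as above: \begin{block}{} \tiny \begin{verbatim}
#include <fftw3.h>

int main(void)
{
    int n = 1024;
    /* FFTW provides aligned allocation for SIMD */
    fftw_complex *in  = fftw_malloc(sizeof(fftw_complex) * n);
    fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * n);

    /* plan once, execute as many times as needed */
    fftw_plan p = fftw_plan_dft_1d(n, in, out,
                                   FFTW_FORWARD, FFTW_ESTIMATE);
    /* ... fill in[] with data ... */
    fftw_execute(p);

    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------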
\begin{frame} \begin{block}{} \begin{center} If you have more than one source file then use a build system \end{center} \end{block} \end{frame}
\begin{frame} Autotools \begin{block}{} \begin{itemize} \item<2->{\tt ./configure} \item<3->{\tt make} \item<4->{\tt make install} \end{itemize} \end{block} \end{frame}
\begin{frame} cmake \begin{block}{} \begin{itemize} \item<2->{\tt cmake} \item<3->{\tt make} \item<4->{\tt make install} \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Optimising code \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Do you want to optimise so that: \begin{itemize} \item<2-> the executable is as small as possible? \item<3-> the code runs as fast as possible? \item<4-> the code has ``perfect'' numerical accuracy? \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Compilers are lazy \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \footnotesize \begin{verbatim}
float matest(float a, float b, float c)
{
    a = a*b + c;
    return a;
}
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \large \tt gcc -S -masm=intel matest.c \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \tiny \begin{verbatim}
matest(float, float, float):
        push   rbp
        mov    rbp,rsp
        movss  DWORD PTR [rbp-0x4],xmm0
        movss  DWORD PTR [rbp-0x8],xmm1
        movss  DWORD PTR [rbp-0xc],xmm2
        movss  xmm0,DWORD PTR [rbp-0x4]
        mulss  xmm0,DWORD PTR [rbp-0x8]
        addss  xmm0,DWORD PTR [rbp-0xc]
        movss  DWORD PTR [rbp-0x4],xmm0
        mov    eax,DWORD PTR [rbp-0x4]
        mov    DWORD PTR [rbp-0x10],eax
        movss  xmm0,DWORD PTR [rbp-0x10]
        pop    rbp
        ret
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Let's make things faster \begin{itemize} \item<2-> {\tt -O0} \item<3-> {\tt -O1} \item<4-> {\tt -O2} \item<5-> {\tt -O3} \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \large \tt gcc -S -masm=intel -O3 matest.c \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \small \begin{verbatim}
matest(float, float, float):
        mulss  xmm0,xmm1
        addss  xmm0,xmm2
        ret
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Let's take advantage of the hardware (Intel compiler syntax; GCC uses {\tt -mavx2}, {\tt -mfma}, {\tt -march=native}, \ldots) \begin{itemize} \item<2-> {\tt -xAVX} \item<3-> {\tt -xCORE-AVX2} \item<4-> {\tt -xCORE-AVX512} \item<5-> {\tt -xHost} \end{itemize} \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} \large \tt gcc -S -masm=intel -O3 -mavx2 -mfma matest.c \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \small \begin{verbatim}
matest(float, float, float):
        vfmadd132ss xmm0,xmm2,xmm1
        ret
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} This is an improvement on the default \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \tiny \begin{verbatim}
matest(float, float, float):
        push   rbp
        mov    rbp,rsp
        movss  DWORD PTR [rbp-0x4],xmm0
        movss  DWORD PTR [rbp-0x8],xmm1
        movss  DWORD PTR [rbp-0xc],xmm2
        movss  xmm0,DWORD PTR [rbp-0x4]
        mulss  xmm0,DWORD PTR [rbp-0x8]
        addss  xmm0,DWORD PTR [rbp-0xc]
        movss  DWORD PTR [rbp-0x4],xmm0
        mov    eax,DWORD PTR [rbp-0x4]
        mov    DWORD PTR [rbp-0x10],eax
        movss  xmm0,DWORD PTR [rbp-0x10]
        pop    rbp
        ret
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
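\begin{frame} To see what the compiler actually did with your loops, ask it for a vectorisation report (the documented report flags for each compiler): \begin{block}{} \begin{itemize} \item<2-> {\tt gcc -O3 -fopt-info-vec mycode.c} \item<3-> {\tt icc -O3 -qopt-report=5 -qopt-report-phase=vec mycode.c} \end{itemize} \end{block} \end{frame}
%---------------------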
\begin{frame} \begin{block}{} \begin{center} Compilers are stupid \\ \footnotesize \it or at least not as intelligent as you might like \normalsize \end{center} \end{block} \end{frame}
\begin{frame}[fragile] \begin{block}{} \tiny \begin{verbatim}
void myfunc(double *a1, double *a2,
            double *prod, int c)
{
    for(int x=0; x < c; x++) {
        prod[x] = a1[x] * a2[x];
    }
}
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} \large {\tt icc -O3 -xCORE-AVX512 myfunc.c } \\ \normalsize \footnotesize \it sometimes isn't enough :-( \normalsize \end{center} \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} Manual intervention is required...\\ \end{center} \end{block} \end{frame}
\begin{frame}[fragile] \begin{block}{} \tiny \begin{verbatim}
void myfunc(double *restrict a1, double *restrict a2,
            double *prod, int c)
{
    for(int x=0; x < c; x=x+8) {
        prod[x]   = a1[x]   * a2[x];
        prod[x+1] = a1[x+1] * a2[x+1];
        prod[x+2] = a1[x+2] * a2[x+2];
        prod[x+3] = a1[x+3] * a2[x+3];
        prod[x+4] = a1[x+4] * a2[x+4];
        prod[x+5] = a1[x+5] * a2[x+5];
        prod[x+6] = a1[x+6] * a2[x+6];
        prod[x+7] = a1[x+7] * a2[x+7];
    }
}
\end{verbatim} \normalsize \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} \large {\tt icc -O2 -xCORE-AVX512 -qopt-zmm-usage=high } \\ \normalsize \end{center} \end{block} \end{frame}
\begin{frame}[fragile] The loop then becomes: \begin{block}{} \footnotesize \begin{verbatim}
vmovups zmm0, ZMMWORD PTR [rdi+r8*8]
vmulpd  zmm1, zmm0, ZMMWORD PTR [rsi+r8*8]
vmovupd ZMMWORD PTR [rdx+r8*8], zmm1
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} mpicc is your friend\\ \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} mpicc is the ``MPI code compiler'' \begin{itemize} \item<2-> {\tt mpicc} \item<3-> {\tt mpiicc} \item<4-> {\tt mpiifort} \item<5-> {\tt mpicxx} \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} mpicc is really a wrapper for the standard (GCC and Intel) compilers \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \tiny \begin{verbatim}
$ mpicc -show mycode.c
/ssoft/spack/paien/v2/opt/gcc/bin/gcc mycode.c
  -I/ssoft/spack/paien/v2/opt/mvapich2/include
  -L/ssoft/spack/paien/v2/opt/mvapich2/lib
  -lmpi
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} OpenMP support comes from the standard compiler \end{center} \end{block} \end{frame}
%---------------------
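\begin{frame}[fragile] A minimal OpenMP sketch (illustrative, not the course code); every thread sees the shared array {\tt a}, compiled with the flags on the next slide: \begin{block}{} \tiny \begin{verbatim}
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int n = 1000000;
    double *a = malloc(n * sizeof(double));

    /* the loop iterations are divided among the threads;
       all threads see the same shared array */
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        a[i] = 2.0 * i;
    }

    printf("ran with up to %d threads\n",
           omp_get_max_threads());
    free(a);
    return 0;
}
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------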
\begin{frame} \begin{block}{} Syntax is compiler dependent \begin{itemize} \item<2-> {\tt gcc -fopenmp mycode.c} \item<3-> {\tt icc -qopenmp mycode.c} \item<4-> {\tt gfortran -fopenmp mycode.f95} \item<5-> {\tt ifort -qopenmp mycode.f95 } \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Running parallel codes \end{center} \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} \it How can we start our parallel code on lots of nodes at the same time? \end{center} \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} \tt \large srun mycode.x \normalsize \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} {\it What resources do I need?} \begin{itemize} \item<2-> Number of nodes? \item<3-> Memory per node? \item<4-> Total number of tasks? \item<5-> Tasks per node? \item<6-> Hardware architecture? \end{itemize} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Job scripts \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \footnotesize \begin{verbatim}
#!/bin/bash
#SBATCH --ntasks=96
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4096

module purge
module load gcc
module load mvapich2

srun my_mpi_code.x --in=myinput.dat
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} {\it Where will these 96 tasks end up? } \begin{itemize} \item<2-> 24:24:24:24 ? \item<3-> 28:28:28:12 ? \item<4-> 16:16:16:16:16:16 ? \end{itemize} \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} {\tt --ntasks-per-node=24} \\ or \\ {\tt --distribution=cyclic:block} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Hybrid codes \end{center} \end{block} \end{frame}
%---------------------
\begin{frame}[fragile] \begin{block}{} \footnotesize \begin{verbatim}
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=6
#SBATCH --cpus-per-task=4
#SBATCH --mem=100G

module purge
module load intel intel-mpi

export OMP_NUM_THREADS=4

srun my_hybrid_code.x --in=myinput.dat
\end{verbatim} \normalsize \end{block} \end{frame}
\begin{frame}[fragile] Better, let Slurm set the thread count: \begin{block}{} \footnotesize \begin{verbatim}
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=6
#SBATCH --cpus-per-task=4
#SBATCH --mem=100G

module purge
module load intel intel-mpi

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

srun my_hybrid_code.x --in=myinput.dat
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} Binding tasks to cores with srun \begin{itemize} \item<2-> {\tt --cpu\_bind=rank } \item<3-> {\tt --cpu\_bind=verbose,rank } \item<4-> {\tt --cpu\_bind=sockets} \item<5-> {\tt --cpu\_bind=mask\_cpu:f,f0} \end{itemize} \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \begin{center} With OpenMP we can\ldots \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} Set the number of threads per task: \begin{itemize} \item<2-> {\tt export OMP\_NUM\_THREADS=8 } \end{itemize} \end{block} \end{frame}
\begin{frame} \begin{block}{} Control how the threads are placed when using Intel \begin{itemize} \small \item<2-> {\tt export KMP\_AFFINITY=scatter} \item<3-> {\tt export KMP\_AFFINITY=compact } \normalsize \end{itemize} \end{block} \end{frame}
\begin{frame} \begin{block}{} Control how the threads are placed when using GCC \begin{itemize} \small \item<2-> {\tt export OMP\_PROC\_BIND=spread} \item<3-> {\tt export OMP\_PROC\_BIND=close} \item<4-> {\tt export GOMP\_CPU\_AFFINITY="0 4 8 12"} \normalsize \end{itemize} \end{block} \end{frame}
%---------------------
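\begin{frame}[fragile] To check where the threads actually land, a small test program helps (a sketch using the GNU {\tt sched\_getcpu} extension; compile with {\tt gcc -fopenmp} and rerun under different placement settings): \begin{block}{} \footnotesize \begin{verbatim}
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <omp.h>

int main(void)
{
    #pragma omp parallel
    {
        /* each thread reports the core it runs on */
        printf("thread %d on core %d\n",
               omp_get_thread_num(), sched_getcpu());
    }
    return 0;
}
\end{verbatim} \normalsize \end{block} \end{frame}
%---------------------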
\begin{frame} \begin{block}{} \begin{center} Errors... \end{center} \end{block} \end{frame}
%---------------------
\begin{frame} \begin{block}{} \tt \small Please verify that both the operating system and the processor support Intel MOVBE, FMA, BMI, LZCNT and AVX2 instructions. \normalsize \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \small ./run.x: error while loading shared libraries: libmkl\_intel\_lp64.so: cannot open shared object file: No such file or directory \normalsize \end{block} \end{frame}
\begin{frame} \begin{block}{} \tt \footnotesize Fatal error in MPI\_Init: Other MPI error, error stack: \\ MPIR\_Init\_thread(514): \\ MPID\_Init(320).......: channel initialization failed \\ MPID\_Init(716).......: PMI\_Get\_id returned 14 \normalsize \end{block} \end{frame}
\begin{frame} \begin{block}{} \begin{center} https://scitas-data.epfl.ch/kb \end{center} \end{block} \end{frame}