%!TEX root = ../s2hat_jpaper.tex
\section{Numerical experiments}\label{sec:experiments} % (fold)
In order to evaluate the accuracy, efficiency and scalability of our new algorithm, we perform a series of tests with different geometries of the 2-sphere grid and different values of $\ell_{max}$ and $m_{max}$. To validate our algorithms, in each experiment we perform a pair of backward and forward SHTs, starting out with a backward transform on a set of random $\bm{a}^{init}_{\ell m}$ coefficients which are uniformly distributed random numbers in the range $(-1,1)$. To measure the accuracy of the algorithm, we analyse the resulting map and calculate the error as the difference between the final output $\bm{a}^{out}_{\ell m}$ and the initial set $\bm{a}^{init}_{\ell m}$ via the following formula,
\begin{equation}\label{eq:error}
{\cal D}_{err} = \sqrt{\frac{\sum\limits_{\ell}\sum\limits_{m}\left|\bm{a}^{init}_{\ell m} - \bm{a}^{out}_{\ell m}\right|^{2}}{\sum\limits_{\ell}\sum\limits_{m}\left|\bm{a}^{init}_{\ell m}\right|^{2}}}.
\end{equation}
The pairs of transforms were tested on HEALPix grids with different values of the resolution parameter $N_{side}$. According to this parameter, $n_{pix} = 12N^{2}_{side}$ and ${\cal R}^{N}=4N_{side}-1$, where the maximum length of a ring (the equator) is equal to $4N_{side}$. Finally, we also choose $\ell_{max}=m_{max}=2 \times N_{side}$, except for some specific experiments.
\subsection{Validation against other implementations}\label{sssub:validate}
In this experiment, a set of $a^{init}_{\ell m}$ coefficients with different values of $\ell_{max}$ was generated; then a pair of backward and forward transforms was performed to convert $a^{init}_{\ell m}$ to a HEALPix map with a fixed $N_{side}=1024$ and then back to $a_{\ell m}^{out}$. Starting with the same input array, we used both of our implementations (MPI/OpenMP \& MPI/CUDA) and the \textbf{libpsht} library \cite{libpsht} to evaluate the transforms. We measured the total time and the final error ${\cal D}_{err}$, Eq.~\eqref{eq:error}.
Figure~\ref{fig:libpsht_err} depicts the ${\cal D}_{err}$ errors for the \textbf{s2hat\_\{mt,cuda\}} and \textbf{libpsht} transform pairs on HEALPix grids with various $\ell_{max}$ and $N_{side}$ values. Every data point is the maximum error value encountered in the pairs of transforms. As we can observe, all implementations provide almost identical quality of solutions with respect to different sizes of the problem.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/alm2map_libpsht_err.png}
\end{center}
\caption{${\cal D}_{err}$ for \textbf{s2hat\_\{mt,cuda\}} and \textbf{libpsht} transform pairs on HEALPix grids with various $\ell_{max}$ and $N_{side}$. Every data point is the maximum error value encountered in the transform pairs. The loss of precision seen in the left panel for $\ell_{max} \simgt 2048$, shared by both implementations, is due to aliasing inherent to the adopted pixelization scheme. }
\label{fig:libpsht_err}
\end{figure}
% \subsubsection{Scaling with $\ell_{max}$}
%
% To study the time scaling of the algorithm with different value of band limit, we perform a number of experiments with a fixed $N_{side} = 1024$ and various values of $\ell_{max}$. The results are shown in Figure~\ref{fig:libpsht_lmax_t}. As expected, the time scales almost with $\ell_{max}^{3}$. The deviation from near perfect scalability can be observed only for GPU implementation. The effect is more evident when we represent the same result in terms of CPU time ratios between \textbf{libpsht} and our implementation. The GPU version loses rapidly performance with growing $\ell_{max}$ but only down to some determined level.
%
% \begin{figure}[htp]
% \begin{center}
% \includegraphics[width=0.95\textwidth]{figures/alm2map_libpsht_lmax_t.pdf}
% \end{center}
% \caption{Single CPU/GPU time of forward and inverse SHTs on a HEALPIx grid with $N_{side}=1024$ and various $\ell_{max}$.}\label{fig:libpsht_lmax_t}
% \end{figure}
%
% \begin{figure}[htp]
% \begin{center}
% \includegraphics[width=0.95\textwidth]{figures/alm2map_libpsht_lmax_sup.pdf}
% \end{center}
% \caption{CPU time ratios between our hybrid implementation and \textbf{libpsht}. Timing measured on a single CPU/GPU in experiment with fixed $N_{side}$ and various $\ell_{max}$}\label{fig:libpsht_lmax_sup}
% \end{figure}
\subsubsection{Scaling with growing resolution and performance comparison with \textbf{libpsht}}
To study the time scaling of the algorithm with growing resolution of the problem, defined via the $N_{side}$ parameter, we perform a number of experiments with various values of $N_{side} = \ell_{max}/2$. As in the previous experiment, we compare our results with the \textbf{libpsht} package.
We choose Martin Reinecke's \textbf{libpsht} because it is, to our knowledge, the fastest implementation of SHTs suitable for astrophysical applications, and it is essentially a realisation of the same algorithm as the one used in our parallel algorithm. This highly optimized implementation is based on explicit multithreading and SIMD extensions (SSE and SSE2), but its functionality is limited to shared memory systems only. Consequently, we perform all comparison experiments with only one pair of multi-core processor and GPU on three different systems listed in Table~\ref{tab:arch}.
\begin{table}[h]
\begin{center}
\begin{tabular}{lcllr}
\emph{CPU} & \emph{clock speed} & \emph{GPU} & \emph{Compilers} & \emph{line style} \\ \hline\hline
{\sc Core i7-960K} & 3.20 GHz & {\sc GeForce GTX 480 } & gcc 4.4.3 / nvcc 4.0 & solid line \\
{\sc Core i7-2600K} & 3.40 GHz & {\sc GeForce GTX 460 } & gcc 4.4.5 / nvcc 4.0 & dashed line \\
{\sc Xeon X5570} & 2.93 GHz & {\sc Tesla S1070 } & icc 12.1.0 / nvcc 4.0 & dotted line \\ \hline
\end{tabular}
\end{center}
\caption{Different Intel multi-core processors and NVIDIA GPUs used in our experiments.}\label{tab:arch}
\end{table}
In a similar fashion as in our multithreaded version, all CPU-intensive parts of \textbf{libpsht}'s transform algorithms are instrumented for parallel execution on multi-core computers using OpenMP directives. Therefore, in all our tests we set the number of OpenMP threads equal to the number of physical cores ($\text{OMP\_NUM\_THREADS} = 4$).
The range of pixelization sizes was limited by the size of the memory of a single GPU. Therefore, we limit our tests to $N_{side} = 4096$. We also limit the timing measurement in this experiment to the recurrence part only (step 3 in Algorithm~\ref{algo:map2almBasic} and step 2 in Algorithm~\ref{algo:alm2mapBasic}); however, in the case of the CUDA routines, these measurements also include the time spent on any data movements between GPU and CPU.
The results of this experiment are shown in Figures~\ref{fig:libpsht_ndside_alm2map_t} and~\ref{fig:libpsht_ndside_map2alm_t}. The figures depict the measured times in seconds and the number of Gflops per second in logarithmic scale. The number of flops includes all the operations necessary for the purpose of numerical stability, like rescaling in the case of the \s2hat routines (see paragraph~\ref{sec:background}).
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/res_alm2map.png}
\end{center}
\caption{Benchmarks for direct SHT performed on different (see Table~\ref{tab:arch} for details) pairs of Intel multi-core processor with GPU on a HEALPix grid. The top plot shows timings for the projection on Legendre polynomials; the libpsht code (orange circles) is compared with the hybrid \s2hat codes (blue\&green lines). In each case we use a HEALPix grid with $N_{side}=\ell_{max}/2$. In the case of the CUDA routines, timings include the time of data movements between CPU and GPU. The bottom pane shows the number of GFLOPS per second. Note that the CUDA routines lose performance in comparison to the SSE-optimised libpsht due to the very costly reduction of partial results, first in shared memory and then in host memory with use of the CPU. Nevertheless, the NVIDIA GTX 480 outperforms libpsht on the Core i7-960K. }\label{fig:libpsht_ndside_alm2map_t}
\end{figure}
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/res_map2alm.png}
\end{center}
\caption{Benchmarks for inverse SHT performed on different (see Table~\ref{tab:arch} for details) pairs of Intel multi-core processor with GPU on a HEALPix grid. The top pane shows timings for the projection on Legendre polynomials; the libpsht code (orange circles) is compared with the hybrid \s2hat codes (blue\&green lines). In each case we use a HEALPix grid with $N_{side}=\ell_{max}/2$. In the case of the CUDA routines, timings include the time of data movements between host and device. The bottom pane shows the number of GFLOPS per second. Note that the GeForce 400 Series, equipped with the latest CUDA architecture (codename "Fermi"), outperforms libpsht on the latest Intel Core i7-2600K. Bear in mind that the GeForce family is not designed for professional use, therefore its double-precision throughput has been reduced to 25\% of the full design.}\label{fig:libpsht_ndside_map2alm_t}
\end{figure}
As expected, our multithreaded version is always slower in comparison to \textbf{libpsht} by a constant factor, mainly due to the lack of SSE intrinsics in our code, which allow some hardware to perform two or more arithmetic operations of the same kind with a single machine instruction. Especially on the latest Intel i7-2600K processor, the SSE-optimised \textbf{libpsht} implementation takes advantage of the extended width of the SIMD register file\footnote{The width of the SIMD register file on the i7-2600K is increased from 128 bits to 256 bits with respect to the previous generation of i7 processors.} and the Advanced Vector Extensions (AVX), which significantly increase parallelism and throughput in floating point SIMD calculations.
The other important observation is that forward and backward SHTs on the CPU, with identical parameters, require almost identical time. Furthermore, the performance analysis allows several deductions about the performance of the GPU version of the SHT algorithms.
\begin{itemize}
\item It is fairly evident that the forward transform on CUDA is slower, not only with respect to its CPU version, but also in comparison with the backward transform on the same architecture. Since the implementation of the CUDA kernel for the realisation of the projection onto associated Legendre polynomials is essentially the same for both directions of the transform, the performance loss is mainly due to the very expensive reduction of partial results, first in shared memory and then in slow global memory or in host memory with use of the CPU.
\item Thanks to the latest CUDA architecture (codename "Fermi"), the GeForce 400 Series of GPUs by far dominates over the old generation Tesla S1070. Note that, unlike the Tesla series, the GeForce family is not designed for professional use, therefore its double-precision throughput has been reduced to 25\% of the full design. Nevertheless, those two GPUs (GTX 460 \& GTX 480) outperform \textbf{libpsht} on the latest Intel i7 processor in the evaluation of the inverse transform.
\item CUDA version seems to favour cases with big $N_{side}$ which indicates usage of many threads (as we explained in~\S\ref{sub:sht_on_gpu}, each ring of the sphere is assigned to one CUDA thread) i.e., both GPU routines gain performance with growing number of sky-pixels (thus, also rings).
\end{itemize}
\subsection{Scaling with number of multi-core processors and GPUs}
The main purpose of using parallel SHTs is to exhibit data parallelism, i.e., to allow us to perform SHTs at big resolutions by distributing our input and output data across different computing nodes to be processed in parallel. A good parallel algorithm should scale with the size of the problem and the number of computing nodes. To measure those two quantities, we perform a series of experiments with different values of $N_{side}$ and different numbers of MPI processes spawned on hybrid pairs of multi-core CPU and GPU. For all experiments in this section, we set $\ell_{max}= 2\times N_{side}$.
All experiments in this section were executed on CCRT's
% \footnote{{\scriptsize http://www-ccrt.cea.fr/fr/moyen\_de\_calcul/titane.htm}}
supercomputer, one of the first hybrid clusters built, based on Intel and Nvidia processors. It has 1068 nodes dedicated for computation; each node is composed of 2 Intel Nehalem quad-cores (2.93 GHz) and 24 GB of memory. One part of the system has Multi-GPU support with 48 Nvidia Tesla S1070 servers; 4 Tesla cards with 4GB of memory and 960 processing units are available for each server. Two computation nodes are connected to one Tesla server by the PCI-Express bus, therefore 4 processors handle 4 Tesla cards. The high performance interconnect is based on Infiniband DDR. This system provides a peak performance of 100 Tflops for the Intel part and 200 Tflops for the GPU part. Intel compiler version 12.0.3 and the NVIDIA Cuda compiler 4.0 were used for compilation. Furthermore, we spawn MPI processes in our experiment in such a way that exactly one MPI process is assigned to one pair of multi-core processor and GPU (4 MPI processes per one Tesla server). To increase intra-node parallelism, we set the number of OpenMP threads equal to the number of physical cores of the Nehalem CPU ($\text{OMP\_NUM\_THREADS} = 4$).
Figure~\ref{fig:scale_all_nside} shows the maximum overall time measured on one of the 128 CPU/GPUs involved in the computation of a pair of SHTs for different sizes of the map. As we can observe, both of our implementations scale well with respect to the size of the problem.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/scale_all_nside.png}
\end{center}
\caption{Maximum elapsed wall clock time for a pair of transforms with various $N_{side}$ and $\ell_{max} = 2\times N_{side}$. Experiment evaluated with 128 MPI processes spanned on the same number of pairs Intel Nehalem quad-cores and Nvidia Tesla S1070.}\label{fig:scale_all_nside}
\end{figure}
Figures~\ref{fig:scale_all_cpu} and~\ref{fig:scale_all_gpu} depict the runtime distribution over the main steps of the algorithm multiplied by the number of MPI processes. In both versions, all steps scale almost perfectly (at least up to 128 MPI processes). As expected, only the communication cost grows slowly with the number of processes. Note that, according to our prediction (see \S\ref{sub:t_consumption}), the evaluation of the projection onto associated Legendre polynomials dominates by far over the remaining steps and it de facto defines the overall time. On the contrary, the time spent on memory transfers between the host and the GPU device never exceeds $1s$, and the deviation from perfect scalability visible in the plots can be explained by random errors of the {\tt MPI\_Wtime()} function used in our experiments for time measurement (due to its limited resolution, {\tt MPI\_Wtime()} returns trustworthy results only for elapsed times greater than $1s$).
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/scale_all_cpu.png}
\end{center}
\caption{Maximum elapsed wall clock time distribution over main steps of the algorithm for the hybrid MPI/OpenMP version of the \textbf{alm2map\_mt} and \textbf{map2alm\_mt} routines. Results for experiment with $N_{side} = 16384$ and $\ell_{max} = 2\times N_{side}$.}\label{fig:scale_all_cpu}
\end{figure}
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/scale_all_gpu.png}
\end{center}
\caption{Maximum elapsed wall clock time distribution over steps of the algorithm for the hybrid MPI/CUDA version of the \textbf{alm2map\_cuda} and \textbf{map2alm\_cuda} routines. Results for experiment with $N_{side} = 16384$ and $\ell_{max} = 2\times N_{side}$.}\label{fig:scale_all_gpu}
\end{figure}
In both implementations we assign the evaluation of the FFTs to the CPU due to its slightly better performance in comparison to CUFFT, also available on Titane (see Fig.~\ref{fig:cufft_vrs_fftw3}).
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/scale_fft_vrs_cufft.png}
\end{center}
\caption{The overall difference between the FFTs evaluation on the Titane supercomputer via cuFFT on the GPU and the Intel MKL Fourier Transform wrapped to the FFTW3 interface. Values correspond to the experiment with $N_{side} = 16384$ and $\ell_{max} = 2\times N_{side}$.}\label{fig:cufft_vrs_fftw3}
\end{figure}
Finally, if we compare the overall time between the CPU and GPU versions, we find that the implementation of \textbf{alm2map} for CUDA is, in all our big test cases, on average $3.3\times$ faster than the multithreaded version. On the contrary, \textbf{map2alm}, due to time consuming reduction operations, remains slower than its counterpart dedicated for CPUs. The average speed-up of the hybrid MPI/CUDA implementation with respect to the hybrid MPI/OpenMP version is depicted in Figure~\ref{fig:scale_sup}.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.95\textwidth]{figures/scale_sup.png}
\end{center}
\caption{Average speed-up of hybrid implementation MPI/CUDA with respect to hybrid MPI/OpenMP version for different number of MPI processes.}\label{fig:scale_sup}
\end{figure}
\subsection{SHTs accuracy on CUDA}
Both the x86 and CUDA architectures used in our experiments respect the IEEE Standard for Binary Floating-Point Arithmetic (IEEE 754-1985). IEEE 754 standardizes how arithmetic results should be approximated in floating point. Whenever working with inexact results, programming decisions can affect accuracy. It is important to consider many aspects of floating point behaviour in order to achieve the highest performance with the precision required for any specific application. This is especially true in a heterogeneous computing environment where operations are performed on different types of hardware.
The IEEE 754 standard defines a handful of basic arithmetic operations like add, subtract, multiply, divide and square root. The results of these operations are guaranteed to be the same for all implementations of the standard, for a given format and rounding mode. However, the IEEE standard does not define the accuracy of math functions like sine, cosine or logarithm, which depends on the implementation. For instance, the CUDA math library includes all the math functions listed in the C99 standard plus some additional useful functions \cite{CUDAManual}. These functions have been tuned for a reasonable compromise between performance and accuracy, which may differ from their equivalents on x86 architectures. For that reason, we performed a series of experiments in which we converted the same random set $a^{init}_{\ell m}$ (see the beginning of this section) to a HEALPix map on two different architectures, namely an NVIDIA GPU and an x86 processor, and we calculated the absolute difference of those two maps (see the example result in Figure~\ref{fig:cuda_cpu_diff}). In all our tests the maximum difference between two sky-pixels never exceeded the value of $1.0\cdot10^{-11}$, which indicates a very good agreement between those two architectures.
\begin{figure}
\centering
\subfigure[Reference map evaluated on CPU]{\label{fig:ref}\includegraphics[width=0.45\textwidth]{figures/ref_map.png}}
\subfigure[Map generated on Nvidia Tesla S1070 GPU]{\label{fig:cuda}\includegraphics[width=0.45\textwidth]{figures/gpu_map.png}} \\
\subfigure[Absolute difference between maps generated on CPU and Nvidia GPU.]{\label{fig:diff}\includegraphics[width=0.9\textwidth]{figures/map_diff.png}}
\caption{CMB maps generated on two different architectures and their absolute difference.}
\label{fig:cuda_cpu_diff}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../s2hat_jpaper"
%%% End: