%!TEX root = ../s2hat_jpaper.tex
\section{Algebraic background}\label{sec:background} % (fold)
% Before deriving algorithms used in our analysis, it is necessary to outline some mathematical preliminaries i.e., parameterization of our domain and harmonic analysis of scalar functions on 2-sphere $S^{2}$.
\subsection{Definitions and notations}
Just as the Fourier basis facilitates a quick and efficient evaluation of convolutions in one- or higher-dimensional spaces, the spherical harmonic basis does so for functions defined on the surface of a sphere, i.e., depending on spherical coordinates: $\theta\in(0,\pi)$ (colatitude) and $\phi\in[0,2\pi)$ (the angle counterclockwise about the positive $z$-axis from the positive $x$-axis). We can express a band-limited approximation $\tilde{f}$ to a real-valued spherical function $f(\theta, \phi)$, $f : S^{2}\rightarrow\mathbb{R}$, in terms of the spherical harmonic basis functions as
{ \small
\begin{eqnarray}\label{eq:direct_continous}
\tilde{f}\l(\theta,\phi\r) & = & \sum_{\ell=0}^{\ell_{max}}\,\sum_{m=-\ell}^{\ell}\,a_{\ell m}\,Y_{\ell m}\l(\theta,\phi\r),
\end{eqnarray}
}
where $a_{\ell m}$ are spherical harmonic coefficients describing $f$ in the harmonic domain, $Y_{\ell m}$ is a spherical harmonic basis function of degree $\ell$ and order $m$, and the tilde over $f$ indicates that it is merely a band-limited approximation of $f$ with a bandwidth $\ell_{max}$. The spherical harmonic coefficients are computed by taking the scalar inner product of $f$ with the corresponding basis function, which can be expressed as the integral
\begin{equation}\label{eq:invers_continous}
a_{\ell m}=\int\limits_{\theta = 0}^{\pi}\int\limits_{\phi=0}^{2\pi}\,d\theta\,d\phi\;f(\theta,\phi)Y^{\dagger}_{\ell m}(\theta,\phi)\sin\theta,
\end{equation}
where $\dagger$ denotes complex conjugation.
In actual applications the spherical harmonic transforms are rather used to project \emph{grid point data} (a discretized scalar field $\bm{s}_{n}$) on the sphere onto the spectral
modes in an \emph{analysis} step (as displayed in equation~\eqref{eqn:map2almDef}) and an inverse transform reconstructing the grid point data from the spectral information in a \emph{synthesis} step (as displayed in equation~\eqref{eqn:alm2mapDef}). Depending on the choice of $\ell_{max}$ and the grid geometry the transform $\bm{s}_{n} \rightarrow \bm{a}_{\ell m}$ may only be approximate, which is indicated by a tilde over $\bm{a}$.
{ \small
\begin{eqnarray}
\tilde{\bm{a}}_{\ell m} & = & \sum_{\l\{\bm{\theta}_{n},\bm{\phi}_{n}\r\}}\,\bm{s}_{n}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r)\,Y^{\dagger}_{\ell m}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r),\label{eqn:map2almDef} \\
\bm{s}_{n}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r) & = & \sum_{\ell=0}^{\ell_{max}}\,\sum_{m=-\ell}^{\ell}\,\bm{a}_{\ell m}\,Y_{\ell m}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r). \label{eqn:alm2mapDef}
\end{eqnarray}
}
In CMB applications, $\bm{s}_{n}$ is a vector of pixelized data, e.g., the brightness of incoming CMB photons, assigned to $n_{pix}$ locations $\l(\theta_{n}, \phi_{n}\r)$ on the sky defined as the centres of suitably chosen sky pixels. The parameter $\ell_{max}$ defines the maximum degree of the Legendre functions, and thus a band-limit of the field $\bm{s}_{n}$, and in practice it is set by the experimental resolution.
The basis functions $Y_{\ell m}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r)$ are defined in terms of the normalised associated Legendre functions ${\cal P}_{\ell m}$ of degree $\ell$ and order $m$,
{ \small
\begin{eqnarray}
Y_{\ell m}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r) \equiv {\cal P}_{\ell m}\l(\cos \bm{\theta}_{n}\r) e^{i m \bm{\phi}_{n}}, \label{eqn:YlmDef}
\end{eqnarray}
}
where we use the relation $Y_{\ell,-m}=(-1)^{m}Y^{\dagger}_{\ell m}$ to compute all spherical harmonics in terms of the associated Legendre functions ${\cal P}_{\ell m}$ with $m \geq 0$.
Consequently, by separating variables following equation~\eqref{eqn:YlmDef} we can reduce the computation of the spherical harmonic transform to a regular Fourier transform in the longitudinal coordinate $\phi_{n}$ followed by a projection onto the associated Legendre functions,
\begin{eqnarray}\label{eq:separation}
\tilde{\bm{a}}_{\ell m} & = & \sum_{\l\{\bm{\theta}_{n}\r\}} \l[\l( \sum_{\l\{\bm{\phi}_{n}\r\}}\,\bm{s}_{n}\l(\bm{\theta}_{n},\bm{\phi}_{n}\r)\,e^{-i m \bm{\phi}_{n}}\r){\cal P}_{\ell m}\l(\cos \bm{\theta}_{n}\r)\r].
\end{eqnarray}
The associated Legendre functions satisfy the following recurrence (with respect to the multipole number $\ell$ for a fixed value of $m$) critical for the algorithms developed in this paper,
%{ \small
\begin{eqnarray}\label{eqn:assLegRec}
{\cal P}_{\ell+2, m}\l(x\r) = \beta_{\ell+2, m} x \, {\cal P}_{\ell+1, m}\l(x\r) + %\nonumber \\
\frac{\beta_{\ell+2, m}}{\beta_{\ell+1, m}} {\cal P}_{\ell m}\l(x\r),%\Big]
\end{eqnarray}
%}
where
%{ \small
\begin{eqnarray}\label{eqn:betaDef}
\beta_{\ell m} = \sqrt{ \frac{4 \,\ell^2-1}{\ell^2-m^2}}.
\end{eqnarray}
%}
The recurrence is initialised by the starting values,
%{ \small
\begin{eqnarray}\label{eqn:pmm}
{\cal P}_{m m}\l(x\r) & = & \frac{1}{2^m\,m!}\, \sqrt{\frac{\l(2m+1\r)!}{4\pi}}\,\l(1-x^2\r)^{m} \nonumber \\
&\equiv& \mu_m \, \l(1-x^2\r)^{m}, \\
{\cal P}_{m+1, m}\l(x\r) & = & \beta_{m+1,m}\,x\,{\cal P}_{mm}\l(x\r).\label{eqn:pm1m}
\end{eqnarray}
%}
The recurrence is numerically stable but special care has to be taken to avoid under- or overflow for large values of $\ell_{max}$~\cite{Gorski_etal_2005}, e.g., for increasing $m$ the ${\cal P}_{m m}$ values can become so extremely small that they can no longer be represented by the binary floating-point numbers of the IEEE 754\footnote{The IEEE has standardised the computer representation for binary floating-point numbers in IEEE 754. This standard is followed by almost all modern machines.} standard. However, since we have the freedom to rescale all the values of ${\cal P}_{\ell m}$, we can dynamically rescale the current values, while performing the recurrence, to prevent a potential overflow on a subsequent step. More precisely, on each step the newly computed value of the associated Legendre function is tested and rescaled if found to be too close to the over- or underflow limits. The rescaling coefficients (e.g., in the form of their logarithms) are kept track of and used to rescale back all the computed values of ${\cal P}_{\ell m}$ at the end as required. This scheme is based on two facts. First, the values of the associated Legendre functions calculated via the recurrence change gradually and rather slowly on each step. Second, their actual values as needed by the transforms are representable within the range of double precision values.
The specific implementation of these ideas used in all our algorithms follows that of the \s2hat software and the {\sc HEALPix} package. It is based on the use of a precomputed vector of values, sampling the dynamic range of the representable double precision numbers, and thus avoids any explicit computation of numerically-expensive logarithms and exponentials. This scaling vector is used to compare the values of ${\cal P}_{\ell m}$ computed on each step of the recurrence, and then it is used to rescale them if needed. For the sake of simplicity, in this paper we assume that this is an integral part of the associated Legendre function evaluation (independent of architecture) and we omit an explicit description of the rescaling algorithm. For more details we refer to \cite{hupca} and the documentation of the {\sc HEALPix} package.
\subsection{Specialized formulation}\label{sec:additional_assumptions} % (fold)
For a general grid of ${\cal O}(n_{pix})$ points on the sphere, a direct calculation has computational complexity ${\cal O}(\ell^{2}_{max} n_{pix})$, which is a serious limitation for problems in the range of our interest. For instance, the current and forthcoming balloon-borne and ground-based experiments will produce maps of the sky containing as many as $n_{pix}\in[10^5,10^6]$ pixels and up to ${\cal O}(10^6 -10^7)$ harmonic modes. Maps from the already operating Planck satellite will consist of between ${\cal O}(10^6)$ and ${\cal O}(10^8)$ pixels and harmonic
modes. However, the algorithm we use hereafter, and which is described in the next section, scales as ${\cal O}(n_{pix}\,\ell_{max})$. We obtain such complexity with the help of additional geometrical constraints imposed on the permissible pixelizations. These are~\cite{Gorski_etal_2005},
\begin{itemize}
\item the map pixels are arranged on ${\cal R}_N\approx\sqrt{n_{pix}}$ iso-latitudinal rings, where the latitude of each ring is identified by a unique polar angle $\theta_{n}$ (for the sake of simplicity, hereafter we will refer to this angle by the ring index, i.e., $r_{n} \rightarrow \theta_{n}$, where $n\in[0,{\cal R}_{N}]$)
\item within each ring $r_{n}$, pixels must be equidistant ($\Delta \phi=${\sc const}), though their number can vary from a ring to another ring.
\end{itemize}
The requirement of the iso-latitude distribution for all pixels helps to save on the number of required FLOPs as the associated Legendre functions now need to be calculated only once for each pixel ring. Consequently, nearly all spherical grids which are currently used in CMB analysis (for instance HEALPix and GLESP), as well as in other fields, conform to such assumptions.
Taking into account these restrictions and definition of basis function $Y_{\ell m}$~\eqref{eqn:YlmDef} we can rewrite (adapted from \cite{Gorski_etal_2005}) the synthesis step from equation~\eqref{eqn:alm2mapDef} as
{ \small
\begin{equation}\label{eqn:Delta2map}
\bm{s}\l({r}_{n},\bm{\phi}_{n}\r) \, = \, \sum_{m=-\ell_{max}}^{\ell_{max}} \, e^{i m \phi_{0}}\,\bm{\Delta}^{A}_m\l({r}_{n}\r),
\end{equation}
}
where $\bm{\Delta}^{A}_m\l({r}_{n}\r)$ is a set of functions such as
{ \small
\begin{equation}\label{eqn:DeltaDef}
\bm{\Delta}^{A}_m\l({r}_{n}\r) \equiv \l\{
\begin{array}{l l}
{\displaystyle \sum_{\ell=0}^{\ell_{max}}\,\bm{a}_{\ell 0} \, {\cal P}_{\ell 0}\l(\cos{r}_{n}\r),} & {\displaystyle m = 0,}\\
{\displaystyle \sum_{\ell=m}^{\ell_{max}}\,\bm{a}_{\ell m}\,{\cal P}_{\ell m}\l(\cos{r}_{n}\r),} & {\displaystyle m > 0,}\\
{\displaystyle \sum_{\ell=\l|m\r|}^{\ell_{max}}\,\bm{a}_{\ell \l|m\r|}^\dagger\,{\cal P}_{\ell \l|m\r|}\l(\cos{r}_{n}\r),} & {\displaystyle m < 0,}
\end{array}
\r.
\end{equation}
}
and respectively the analysis step~\eqref{eqn:map2almDef} as
{ \small
\begin{equation}\label{eqn:almdef}
\tilde{\bm{a}}_{\ell m} = \sum^{{\cal R}_{N}}_{n = 0} \, \bm{\Delta}^{S}_m\l({r}_{n}\r)\,{\cal P}_{\ell m}\l(\cos{r}_{n}\r),
\end{equation}
}
where
{ \small
\begin{equation}\label{eqn:almfftdef}
\bm{\Delta}^{S}_m\l({r}_{n}\r) \equiv \sum_{\l\{\phi_{n}\r\}}\, \bm{s}\l({r}_{n},\bm{\phi}_{n}\r)\,e^{-im\phi_{0}}.
\end{equation}
} Here $\phi_{0}$ denotes the angle $\phi$ of the first pixel in each ring.
From the above, we can see that the spherical harmonic transforms can be split into two successive computations: one calculating the associated Legendre functions, Eqs.~\eqref{eqn:DeltaDef} and~\eqref{eqn:almdef}, and the other performing the series of Fast Fourier Transforms (FFTs), Eqs.~\eqref{eqn:Delta2map} and~\eqref{eqn:almfftdef}~\cite{Muciaccia_etal_1997}. This splitting immensely facilitates the implementation; for instance, in order to evaluate the Fourier transforms we can use third-party libraries devoted to this problem. However, we have to bear in mind that different libraries offer different performance with a strong dependence on the length of the FFT. This requires our special attention since the number of samples (pixels) per ring may vary and this strongly affects the efficiency of the FFT algorithms. For instance, the well-known {\tt fftpack} library \cite{Schw82}, used in the {\sc healpix} \cite{Gorski_etal_2005} and {\sc libpsht} \cite{libpsht} libraries, performs well only for a length $N_{FFT}$ whose prime decomposition only contains the factors $2$, $3$, and $5$, while for prime number lengths its complexity degrades from ${\cal O}\l(N_{FFT} \log N_{FFT}\r)$ to ${\cal O}(N_{FFT}^{2})$. For this reason we employ by default in all versions of our algorithm the {\tt FFTW} library \cite{frigo2005design}, which uses ${\cal O}\l(N_{FFT} \log N_{FFT}\r)$ algorithms even for prime sizes.
\subsection{Time consumption}\label{sub:t_consumption}
Three parameters typically determine the sizes and numerical complexity characteristic of our problem. These are a total number of pixels, $n_{pix}$, a number of rings of pixels in the analyzed map, ${\cal R}_N$, and a maximum order of the associated Legendre functions, $\ell_{max}$. In particular a number of modes in the harmonic domain is $\sim \ell_{max}^2$.
In well defined cases these three parameters are not completely independent. For example, in the case of
full sky maps, we have usually $\ell_{max} \propto{\cal R}_N \propto n_{pix}^{1/2}$. This implies that the recurrence step requires ${\cal O}\l({\cal R}_{N}\ell_{max}^2\r)$ floating point operations (FLOPs), due to the fact that for each of the ${\cal R}_{N}$ rings we have to calculate a full set, up to $\ell_{max}$, of the associated Legendre functions, ${\cal P}_{\ell m}$, at a cost $\propto \ell_{max}^2$. The Fourier transform step, as mentioned above, has then a subdominant complexity ${\cal O}\l({\cal R}_{N}\ell_{max} \log \ell_{max}\r)$. These theoretical expectations agree with our experimental results shown in Fig.~\ref{fig:pie_plot} depicting a typical breakdown of the average overall time between the main steps involved in the SHTs computation as obtained for a run performed on a quad-core Intel i7-2600K processor.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.75\textwidth]{figures/recurence_vrs_fft.png}
\end{center}
\vskip -0.5truecm
\caption{The overall time breakdown between the main steps of the SHTs algorithm as computed with the identical band limit ($\ell_{max} = 4096$) for quad-core Intel i7-2600K processor.}\label{fig:pie_plot}
\end{figure}
We can observe that indeed the evaluation of the Legendre transform dominates by far over FFT in terms of time of computation. This motivated us to study the possible improvements of this part of the calculation. We explore this issue in the context of the heterogeneous architectures and hybrid programming models. In particular we introduce a parallel algorithm that is suitable for clusters of accelerators, namely multi-core processors and GPUs which we employ for an efficient evaluation of spherical harmonic transforms.
\nocite{st_analysis}
% section complexity_analisis (end)
\section{Basic algorithm}\label{sec:basic_algorithm} % (fold)
We proceed by developing algorithms which agree with assumptions listed in the previous section. Then, in the next section we introduce their parallel versions and we discuss possible improvements specific to the targeted architectures.
Algorithm \ref{algo:map2almBasic} presents the computation of the discrete, direct SHT which is a realisation of equation~\eqref{eqn:map2almDef} for a given ``bandwidth'' $\ell_{max}$ and an input data vector $\bm{s}$ obtained from the samples of the original function which we wish to transform.
\begin{algorithm}
\caption{{\sc Direct transform (Eq.~\ref{eqn:map2almDef})}}
\begin{algorithmic}
{ \small \sl
\REQUIRE{$\bm{s}\in\mathbb{R}_{n_{pix}}$ values}
\STATE{ {\sc step 1 - $\bm{\Delta}^{S}_m\l(r\r)$ calculation} }
\FOR{every ring $r$}
\FOR{every $m = 0, ..., {m}_{max}$}
\STATE{ \hspace{0.1cm} $\circ$ calculate $\bm{\Delta}^{S}_m\l(r\r)$ via FFT Eq.~\eqref{eqn:almfftdef}}
\ENDFOR{($m$)}
\ENDFOR{($r$)}
\medskip
\STATE { {\sc step 2 - Pre-computation }} \\
\FOR{every $m = 0, ..., {m}_{max}$}
\STATE { \hspace{0.1cm} $\circ$ $\bm{\mu}_m$ pre-computation Eq.~\eqref{eqn:pmm}\\ }
\ENDFOR{ ($m$)}
\medskip
\STATE{ {\sc step 3 - Core calculation} }
\FOR{every ring $r$}
\FOR{every $m = 0, ..., {m}_{max}$}
\FOR{every $\ell = m, ..., \ell_{max}$}
\STATE{ \hspace{0.1cm} $\circ$ compute ${\cal P}_{\ell m}$ via Eq.~\eqref{eqn:assLegRec} }
\STATE{ \hspace{0.1cm} $\circ$ update $\bm{a}_{\ell m}$, Eq.~\eqref{eqn:almdef} }
\ENDFOR{ ($\ell$)}
\ENDFOR{ ($m$)}
\ENDFOR{ ($r$)}
\medskip
\RETURN {$\bm{a}_{\ell m}\in\mathbb{C}^{\ell\times m}$}
}
\end{algorithmic}
\label{algo:map2almBasic}
\end{algorithm}
Respectively, algorithm~\ref{algo:alm2mapBasic} presents the computation of the discrete, inverse transform from equation~\eqref{eqn:alm2mapDef} which takes as input a set of complex numbers $\bm{a}_{\ell m}$, interpreted as Fourier coefficients in a spherical harmonic expansion, and returns a set of sample values, i.e., a real data vector $\bm{s}$.
\begin{algorithm}
\caption{{\sc Inverse Transform (Eq.~\ref{eqn:alm2mapDef})}}
\begin{algorithmic}
{ \small \sl
\REQUIRE {$\bm{a}_{\ell m}\in\mathbb{C}^{\ell\times m}$ values}
\STATE { {\sc step 1 - Pre-computation }} \\
\FOR{every $m = 1, ..., {m}_{max}$}
\STATE { \hspace{0.1cm} $\circ$ $\bm{\mu}_m$ precomputation Eq.~\eqref{eqn:pmm}\\ }
\ENDFOR{ ($m$)}
\medskip
\STATE { {\sc step 2 - $\bm{\Delta}^{A}_m$ calculation} }
\FOR{every ring $r$}
\FOR{every $m = 0, ..., {m}_{max}$}
\FOR{every $\ell = m, ..., \ell_{max}$}
\STATE{ \hspace{0.1cm} $\circ$ compute ${\cal P}_{\ell m}$ via Eq.~\eqref{eqn:assLegRec} }
\STATE{ \hspace{0.1cm} $\circ$ update $\bm{\Delta}^{A}_m\l( r\r)$, Eq.~\eqref{eqn:DeltaDef} }
\ENDFOR{ ($\ell$)}
\ENDFOR{ ($m$)}
\ENDFOR{ ($r$)}
\medskip
\STATE{ {\sc step 3 - $\bm{s}$ calculation} }
\FOR{every ring $r$}
\FOR{every $m = 0, ..., {m}_{max}$}
\STATE{ \hspace{0.1cm} $\circ$ calculate $\bm{s}$ via FFT and given $\bm{\Delta}^{A}_m\l(r\r)$, Eq.~\eqref{eqn:Delta2map}}
\ENDFOR{($m$)}
\ENDFOR{ ($r$)}
\medskip
\RETURN {$\bm{s}\in\mathbb{R}_{n_{pix}}$}
}
\end{algorithmic}
\label{algo:alm2mapBasic}
\end{algorithm}
\subsection{Similarities}
Both algorithms use a divide and conquer approach, i.e., our problem of computing the projection onto the associated Legendre functions is decomposed into smaller subproblems of a similar form (rings of pixels). The subproblems are solved recursively by a further subdivision (computation of ${\cal P}_{\ell m}$ via the recurrence with respect to the multipole number $\ell$ for a fixed value of $m$) and finally their solutions are combined to solve the original problem. Furthermore, both algorithms consist of three main steps:
\begin{itemize}
\item the pre-computation of the starting values of the recurrence \eqref{eqn:pmm} in the loop over all $m \in [0,m_{max}]$ (step 2 in Algorithm \ref{algo:map2almBasic} and step 1 in Algorithm \ref{algo:alm2mapBasic} ),
\item the evaluation of the associated Legendre function for each ring of pixels via the recurrence from equation~\eqref{eqn:assLegRec}. This requires three nested loops, where the loop over $\ell$ is set innermost to facilitate the evaluation of the recurrence with respect to the multipole number $\ell$ for a fixed value of $m$ (step 3 in Algorithm \ref{algo:map2almBasic} and step 2 in Algorithm \ref{algo:alm2mapBasic} ).
\item the step where, via FFTs, we either generate the Fourier representation of the input vector $\bm{s}$~\eqref{eqn:almfftdef} (step 1 in Algorithm \ref{algo:map2almBasic}) or, to complete the computation in the inverse algorithm, an FFT is performed to compute the sums~\eqref{eqn:Delta2map} (step 3 in Algorithm \ref{algo:alm2mapBasic}).
\end{itemize}
\subsection{Differences}
Both algorithms are almost the same in terms of the number and type of arithmetic operations required for the computation (see for instance Figure~\ref{fig:pie_plot} or a detailed analysis in \cite{st_analysis}). The main difference consists in the dimension of the partial results which need to be updated within the nested loop where we evaluate the associated Legendre functions (step 3 in Algorithm \ref{algo:map2almBasic} and step 2 in Algorithm \ref{algo:alm2mapBasic}). In the case of the direct transform, for each ring of pixels on the sphere we need to update all non-zero values of the matrix $\bm{a}_{\ell m}\in\mathbb{C}^{\ell_{max}\times m_{max}}$ by a product between the precomputed inner sums defined as $\bm{\Delta}^{S}_m$~\eqref{eqn:almfftdef} and the associated Legendre function for each fixed $m$. In the case of the inverse transform, where the input is a set of complex numbers $\bm{a}_{\ell m}$, the nested loop (step 2 in Algorithm \ref{algo:alm2mapBasic}) is used to compute the inner sum $\bm{\Delta}^{A}_m$~\eqref{eqn:DeltaDef}, which can be obtained as a matrix-vector product
\begin{equation}\label{eq:matrix_vec}
\bm{\Delta}^{A}_m
=
\l(
\begin{array}{c}
\Delta^{A}_m(r_0)\\
\Delta^{A}_m(r_1)\\
\vdots\\
\Delta^{A}_m(r_{{\cal R}_N})\\
\end{array}
\r)
=
\l(
\begin{array}{ccc}
{\cal P}_{m m}\l(\cos r_{0}\r) & \hdots & {\cal P}_{\ell_{max} m}\l(\cos r_{0}\r) \\
{\cal P}_{m m}\l(\cos r_{1}\r) & \hdots & {\cal P}_{\ell_{max} m}\l(\cos r_{1}\r) \\
\vdots & \vdots & \vdots \\
{\cal P}_{m m}\l(\cos r_{{\cal R}_{N}}\r) & \hdots & {\cal P}_{\ell_{max} m}\l(\cos r_{{\cal R}_{N}}\r) \\
\end{array}
\r)
\,
\l(
\begin{array}{c}
a_{m m} \\
a_{m+1 m} \\
\vdots\\
a_{\ell_{max} m} \\
\end{array}
\r).
\end{equation}
As we will see further in this report, those differences may lead to a significant contrast in terms of evaluation time and memory consumption. This effect can be additionally enhanced by the type of the architecture on which we execute our algorithm.
\section{Parallel Spherical Harmonic Transforms}\label{sec:parallelization}
\subsection{Top-level distributed-memory parallel framework}
The top level parallelism employed in this work is adopted from the S$^2$HAT library developed by the last author. Though not new, it has not been described in detail in the literature yet and we present below its first detailed description including its performance model and analysis. The framework is based on a distributed-memory programming model and involves two computational stages separated by a single instance of a global communication involving all the processes performing the calculations. These together with a specialized, but flexible data distribution, are the major hallmarks of the framework. The two stages of the computations can be further parallelized and/or optimized and we discuss two examples of potential intra-node parallelization approaches below. We note that the framework as such does not imply what algorithm should be used
by the MPI processes, e.g.,~\cite{wavemoth}, though of course its overall performance and scalability will depend on a specific choice.
The major motivation behind
such a distributed memory parallelization level comes from the volumes of the data, both maps and harmonic coefficients, to be analyzed in the applications targeted here and from the need to maintain flexibility of the routines designed to be used in massively parallel applications, where SHTs typically constitute only one step of the overall process. The latter requirement calls not only for a flexible interface and an ability to use efficiently as many MPI processes as available, but also for a low memory footprint per process.
\subsubsection{Data distribution and algorithm}
The simple structure of the top-level parallelism outlined above is facilitated by a data layout, which assumes that both the map $\bm{s}$ and its harmonic coefficients $\bm{a}_{\ell m}$, are distributed. The map distribution is ring-based, i.e., we assign to each process a part of the full map consisting of complete rings of pixels. To do so we make use of the assumed equatorial symmetry of the pixelization schemes and first divide all rings of one of the hemispheres (including the equatorial ring if present) into disjoint subsets made of complete, consecutive rings, and then assign to each process a part of the map corresponding to one of such ring subsets together with its symmetric counterpart from the other hemisphere. If we denote the rings mapped on a processor $i$ as ${\cal R}_{i}$, then $\cup^{n_{procs}}_{i=0} {\cal R}_{i} = {\cal R}$, where ${\cal R}$ is the set of all rings and $n_{procs}$ is the number of processes used in the computation. We also have ${\cal R}_{i}\cap{\cal R}_{j}=\emptyset$ if $i\ne j$. An example ring distribution is depicted in Figure~\ref{fig:dist_rings}, where different colours mark pixels on iso-latitudinal rings mapped on two different processes.
In order to distribute harmonic domain object we divide the 2-dimensional array of the coefficients $\bm{a}_{\ell m}$ with respect to a number, $m$, and assign to each process %$p_{i} = (m \mod n_{procs})$
a subset of values of $m$, ${\cal M}_i$, and a corresponding subset of the harmonic coefficients, $\l\{ {a}_{\ell m} : m\in {\cal M}_i\ \hbox{and}\ \ell \in [m, \ell_{max}]\r\}$. As with the ring subsets, the subsets of $m$ values have to be disjoint, ${\cal M}_i \cap {\cal M}_j = \emptyset$ if $i\ne j$ and include all the values, $\cup^{n_{procs}}_{i=0} {\cal M}_{i}={\cal M}$.
An example of the harmonic domain distribution is depicted in Figure~\ref{fig:dist_alm}, where like in the case of the ring distribution we marked with different colours ${a}_{\ell m}$ coefficients mapped on two different processes.
\begin{figure}[ht]
\centering
\subfigure[The iso-latitudinal rings distribution.]{
\includegraphics[width=0.55\textwidth]{figures/rings_distribution.png}
\label{fig:dist_rings}
}
\subfigure[Distribution of ${a}_{\ell m}$ coefficients.]{
\includegraphics[width=0.37\textwidth]{figures/alm_distribution.png}
\label{fig:dist_alm}
}
\caption{Example of a distribution of the iso-latitudinal rings and the $\bm{a}_{\ell m}$ coefficients among two processes. Each colour marks elements assigned to one process. For rings visualisation we used an August's Conformal Projection of the sphere on a two-cusped epicycloid.}
\end{figure}
This data layout allows us to avoid any redundancy in terms of calculations and memory and any need for an intermediate inter-process communication, while at the same time it is general enough that, at least in principle, it can accommodate
an entire slew of possibilities as driven by real applications. Any freedom in selecting a specific layout should be used to ensure a proper load balancing in terms
of both memory and work, and it may, and in general will, depend on what algorithm is used by each of the MPI processes. Though perfect load balancing, memory- and work-wise, may not be always possible, in practice we have found that good compromises usually exist. In particular, the map distribution which
assigns the same number of rings to each process is typically satisfactory, even if it may in principle lead to an unbalanced work and/or memory distribution. The latter is indeed the case for the {\sc healpix} pixelization which does not preserve the same number of samples per ring, which introduces differences in the memory usage between the processes but
also in the workload. Some fine tuning could be used here to improve on both these aspects, as it is indeed done in {\sc s$^2$hat}, but even without that the differences are not
critical. For the distribution of the harmonic objects we propose ${\cal M}_i \equiv \l\{ m : m = i + k \, n_{procs} \ \hbox{or} \ m = m_{max}-i-k\, n_{procs}, \ \hbox{where}\ k \in [0, m_{max}/2]\r\}$, so the process $i$ stores the values of $m$ equal to $\l[i, i+n_{procs}, ..., m_{max}-i-n_{procs}, m_{max}-i\r]$. This usually provides a good first try, which appears satisfactory in a number of cases as it will be explained in the next section.
We note that this data layout imposes an upper limit on the number of processes which could be used, and which is given by $\min ( m_{max}/2, {\cal R}_N/2)$. This is typically
sufficiently generous not to lead to any problems, as usually we have ${\cal R}_N \propto m_{max}\simeq \ell_{max}$ and the limit on a number of processes is ${\cal O}\l(\ell_{max}\r)$.
With this data distribution, Algorithms~\ref{algo:map2almBasic} and~\ref{algo:alm2mapBasic} require only straightforward and minor modifications to become functional parallel codes. As an example, Algorithm~\ref{algo:Generallalm2mapS2HATParallel} details the parallel inverse transform,
which is a parallel version of Algorithm~\ref{algo:alm2mapBasic}.
\begin{algorithm*}
\caption{{\sc General parallel \alm2map algorithm ( Code executed by each MPI process )}}
\begin{algorithmic}
\REQUIRE {$\bm{a}_{\ell m}\in\mathbb{C}^{\ell\times m}$ for $m\in{\cal M}_{i}$ and all $\ell$}
\REQUIRE {indices of rings $r \in {\cal R}_i$}
\medskip
\STATE { {\sc step 1 - Pre-computation }} \\
\FOR{every $m\in{\cal M}_{i}$}
\STATE { \hspace{0.1cm} $\circ$ $\bm{\mu}_m$ pre-computation Eq.~\eqref{eqn:pmm}\\ }
\ENDFOR{ ($m$)}
\medskip
\STATE { {\sc step 2 - $\bm{\Delta}^{A}_m$ calculation }} \\
\FOR{ \underline{every} ring $r\in{\cal R}$ \textbf{and} every $m \in {\cal M}_i$ }
\STATE {
$\circ$ initialise the recurrence: ${\cal P}_{mm}$, ${\cal P}_{m+1, m}$ using precomputed $\bm{\mu}_m$ Eqs.~\eqref{eqn:pmm}~\&~\eqref{eqn:pm1m} \\
$\circ$ precompute recurrence coefficients, $\beta_{\ell m}$~Eq.~\eqref{eqn:betaDef} \\
}
\FOR{ every $\ell = m+2, ..., \ell_{max}$}
\STATE {
$\circ$ compute ${\cal P}_{\ell m}$ via the 2-point recurrence, given precomputed $\bm{\beta}_{\ell m}$, Eq.~\eqref{eqn:assLegRec} \\
$\circ$ update $\bm{\Delta}^{A}_m\l( r\r)$, Eq.~\eqref{eqn:DeltaDef} \\
}
\ENDFOR{ ($\ell$)}
\ENDFOR{ ($r$) \textbf{and} ($m$)}
\medskip
\STATE{{\sc global communication}}\\
\STATE{ \hspace{0.2cm} $\circ$ $\l\{\bm{\Delta}^{A}_{m}\l(r\r),\; m\in {\cal M}_i,\; \hbox{{\rm \underline{all} $r\in{\cal R}$}}\r\} {\hbox{ {\tt \scriptsize MPI\_Alltoallv}}\atop\Longleftrightarrow} \l\{\bm{\Delta}^{A}_{m}\l(r\r),\; r\in {\cal R}_{i},\; \hbox{{\rm \underline{all} $m\in{\cal M}$}}\r\}$ }
\medskip
\STATE{\sc step 3 - $\bm{s}_{n}$ calculation}\\
\FOR{every ring $r \in {\cal R}_i$}
\STATE{ $\circ$ via FFT calculate $\bm{s}\l(r, \phi\r)$ for all samples $\phi$ of ring $r$, given pre-computed $\bm{\Delta}^{A}_m\l(r\r)$ for \underline{all} $m$. }
\ENDFOR{($r$)}
\medskip
\RETURN {$\bm{s}_{n}$ for all $\l(r\in {\cal R}_i\r)$}
\end{algorithmic}
\label{algo:Generallalm2mapS2HATParallel}
\end{algorithm*}
The operations listed there are to be performed by each of the $n_{procs}$ MPI processes involved in the computation. Like its serial counterpart, the parallel algorithm can be
also split into steps. First, we precompute the starting values of the Legendre recurrence $\bm{\mu}_{m}$ \eqref{eqn:pmm}, but only results for $m \in {\cal M}_i$ are preserved in memory. In the next step, we calculate the functions $\bm{\Delta}^{A}_m$ using Eq.~\eqref{eqn:DeltaDef} for {\em every} ring $r$ of the sphere, but only for $m$ values in ${\cal M}_i$. Once this is done, a global (with respect to the number of processes involved in the computation) data exchange is performed so that at the end each process has in its memory the values of $\bm{\Delta}^{A}_m(r)$ computed only for rings $r\in{\cal R}_{i}$ but for {\em all} $m$ values. Finally, via FFT we synthesize partial results as in equation~\eqref{eqn:Delta2map}. From the point of view of the framework, steps 1 and 2 constitute the first stage of the computation, followed by the global communication, and the second computation stage, involving here the calculation of the FFT. As mentioned earlier, though the communication is an inherent part of the framework, the two computation stages can
be adapted as needed. Hereafter, we adhere to the standard SHT algorithm as used in Algorithm~\ref{algo:Generallalm2mapS2HATParallel}.
\subsubsection{Performance analysis}
\paragraph{Operation count.}
We note that the scalability of all the major data products can be assured by adopting an appropriate distribution of the harmonic coefficients and the map. This
stems from the fact that, apart from the input and output objects, i.e.,
maps and harmonic coefficients themselves, the volumes of which are inversely proportional to a number of the MPI processes by the construction, the largest data objects
derived as intermediate products, are the arrays $\bm{\Delta}^{A}_m$ or $\bm{\Delta}^{S}_m$. Their sizes are in turn given as ${\cal O}({\cal R}_N\,m_{max}/n_{procs})$, and again decrease inversely
proportionally with $n_{procs}$. These objects are computed
during the first computation stage, subsequently redistributed as the result of the global communication, and used as an input for the second stage of the computation. Their total volume,
as integrated over the number of processes, therefore also determines the total volume of the communication, as discussed in the next Section. As these three objects are
indeed the biggest memory consumers in the routines, the overall memory usage scales as $(n_{pix}+2\,m_{max}\,\ell_{max}+{\cal R}_N\,m_{max})/n_{proc}$. For
typical pixelization schemes, full sky maps, analyzed at its sampling limit, we have $n_{pix} \sim \ell_{max}^2$, $\ell_{max} \simeq m_{max}$, and ${\cal R}_N \sim n_{pix}^{1/2}$,
and therefore each of these three data objects contributes nearly evenly to the total memory budget of the routines. Consequently, the actual memory high-water mark is typically $\sim50$\% higher than the volume of the input and output data. We note that if all three parameters, $\ell_{max}, \ m_{max}, {\cal R}_N$, are allowed to assume arbitrary values
memory re-using and, in particular, in-place implementations are not straightforward, as it may not be easy or possible to fit any of these data objects in the space assigned for
others. For this reason, this kind of options are not implemented in the software discussed here.
Given the assumed data layout, also the number of floating point operations (FLOPs) required for the evaluation of FFTs and associated Legendre functions scales inversely proportionally to the number of the processes. The only exception is related to the cost of the initialization of the recurrence (Eq.~\ref{eqn:pmm}), which is performed redundantly by each MPI process to avoid an extra communication. Consequently, each process receives at least one $m$ value of order $m_{max}$ and has to perform at least ${\cal O}(m_{max})$ operations as part of the pre-computation, equation~\eqref{eqn:pmm}. A summary of the FLOPs required in each step of the parallel SHT algorithms is presented in Table~\ref{tab:flops}, where we have assumed here that the subsets ${\cal R}_{i}$ and ${\cal M}_{i}$ contain $ \simeq {\cal R}_{N}/n_{proc}$ and
$\simeq m_{max}/n_{proc}$ elements, respectively.
\begin{table}[h]
\begin{center}
\begin{tabular}{llc}
& & \emph{FLOPs} \\
\hline\hline
\medskip
{\sc Pre-computation} & {\sc Step \{2,1\} } & $O\left(m_{max}\right)$ \\
\medskip
{\sc Recurrence} & {\sc Step \{3,2\} } & $O\left({\displaystyle {\cal R}_{N}\ell_{max} \frac{m_{max}}{n_{proc}}}\right)$\\
\medskip
{\sc FFTs } & {\sc Step \{1,3\} } & $O\left({\displaystyle \frac{ {\cal R_{N}}} {n_{proc}}\;m_{max}\log m_{max}}\right)$\\ \hline
\end{tabular}
\end{center}
\caption{Number of floating point operations required in the parallel SHTs algorithm.}\label{tab:flops}
\end{table}
\paragraph{Communication cost.}
In order to determine the overall performance of the framework we have to also estimate the communication cost. The data exchange is performed with help of a single collective operation of the type all-to-all in which each process sends data to, and receives from, every other process. We use a simple model to estimate the cost of the collective communication algorithms in terms of latency and bandwidth use i.e., we assume that the time taken to send a message between any two nodes can be modelled as $\alpha + \beta\,n$, where $\alpha$ is the latency (or startup time) per message, independent of message size, $\beta$ is the transfer time per byte, and $n$ is the number of bytes transferred. However in case of the collective communication routines, different MPI libraries use different algorithms, which moreover may depend on a size of the message. For specificity, we base our analysis on MPICH~\cite{mpich}, a widely used MPI library. Similar considerations can be performed for other libraries.
The all-to-all communication in MPICH~\cite{collective_mpi} for short messages ($\leq 256$ kilobytes per message) uses the index algorithm by Bruck et al.~\cite{bruck}. It is a store-and-forward algorithm that takes $\lceil \log n_{proc} \rceil$ steps at the expense of some extra data communication ($\beta\,n \log n_{proc}$ instead of $\beta\,n$). For long messages and an even number of processes, MPICH uses a pairwise-exchange algorithm, which takes $n_{proc} - 1$ steps. On each step, each process calculates its target process and exchanges data directly with that process, i.e., data is directly communicated from source to destination, with no intermediate steps.
Note that in case of even distribution of rings $r$ and $m$ values, the size of message to be exchanged between each pair of processes can be defined as
\begin{equation}\label{eq:msg_size}
S_{msg} := {\cal R}_{N}\frac{m_{max}}{n_{proc}}n_{\mathbb C},
\end{equation}
where $n_{\mathbb C}$ is the size in bytes of a complex number representation (usually $n_{\mathbb C} = 16$).
Finally, merging all the assumptions listed above we can derive the following formula for the total time required for communication in our parallel SHT algorithm.
\begin{equation}
T_{comm} =
\begin{dcases*}
\alpha \log n_{proc} + \beta\,S_{msg}\log n_{proc} & when $S_{msg}\leq$ 256\,kB \\
\alpha \l(n_{proc} - 1\r) + \beta\,S_{msg}\,\l(n_{proc}-1\r) & when $S_{msg}>$ 256\,kB
\end{dcases*}.
\label{eqn:comm}
\end{equation}
\begin{figure}[ht]
\centering
%\includegraphics[width=0.95\textwidth]{figures/times_peta.pdf}
\includegraphics[width=0.325\textwidth]{figures/nproc_fixed_nside.png}
\includegraphics[width=0.325\textwidth]{figures/nside_fixed_nproc.png}
\includegraphics[width=0.325\textwidth]{figures/finall.png}
\caption{The two left most plots show theoretical run-times (logarithmic scale) as function of a number of processes and a fixed size of the problem, left most panel, and as a function
of a resolution/size of the problem for the fixed number of processes, middle. The sizes of the problem shown in the leftmost panel with thick, solid lines, correspond to $\simeq 2\times10^8$
pixels and corresponding harmonic modes, as given by the HEALPix parameter value $N_{side} = 4096$. Thin, dashed lines show cases with $4$ times fewer (more) pixels, lower and upper
lines respectively.
In the middle panel the number of processes is assumed to be $n_{proc}=512$, thick, solid line, or 256 (1024), as shown by thin dashed lines.
The contours in the rightmost panel display a ratio of the total computation to communication times shown as a function of a number of process and size of the problem.
}\label{fig:theoretical_performance}
\end{figure}
In Figure~\ref{fig:theoretical_performance} we illustrate theoretical runtimes depending on the number of the employed processes (leftmost panel) and on the size of the problem (middle panel).
The transform parameters have been set assuming a standard full sky, fully resolved case, and the HEALPix pixelization, i.e., $\ell_{max} = m_{max} = 2 \, N_{side}, \ {\cal R}_N = 4\,N_{side}-1, \ n_{pix} = 12 \, N_{side}^2$. Here, $N_{side}$ is the HEALPix resolution parameter (for more details see beginning of the section~\ref{sec:experiments} and \cite{Gorski_etal_2005}).
For the hardware parameters, in the estimation of the communication cost we have used following~\cite{predigting_run_times,passinggraph} $\alpha =10^{-5}$, $\beta=10^{-9}$, while for the cost of calculations we have assumed that each MPI process attains the effective performance of $10$Giga operations per second, resulting in a time of $\gamma=10^{-10}$ seconds per FLOP. The choice we have made is meant to be only indicative, yet at its face value the latter number is more typical of the MPI processes being nodes of multi-cores than just single cores, in particular given the usual efficiency of such calculations, which is usually on order of $10-20$\%. If less efficient MPI processes
are used then the solid curves in Fig.~\ref{fig:theoretical_performance} corresponding to the computation times should be shifted upwards by a relevant factor.
The dominant calculation is the recurrence needed to compute the associated Legendre functions in agreement with our earlier numerical result, Fig.~\ref{fig:pie_plot}. The computation scales
nearly perfectly with the number of processes used, decaying as $\propto 1/n_{proc}$, and it increases with growing size of the problem, $\propto N_{side}^2$. At the same time the communication
cost is seen to be independent of the number of processes. This is because for the numbers of processes considered here the size of the message exchanged between any two processes is
always large and the communication time
is thus described by the second term of the second equation of Eqs.~\ref{eqn:comm}. Given that the single message size, $S_{msg}$, decreases linearly with the number of processes, the total
communication time does not depend on it. The immediate consequence of these two observations is that both these times will be found comparable if sufficiently large number of processes is
used. A number of processes at which such a transition takes place will depend on the constants entering in the calculation of Eqs.~\ref{eqn:comm} and the size of the problem.
This dependence of the critical number of processes on the size of the problem is shown in the rightmost panel of Fig.~\ref{fig:theoretical_performance} with thick solid contour labelled $1.0$.
Clearly, from the perspective of the transform runtimes there is no extra gain, which could be obtained from using more processes than given by their critical value. In practice, a number of
processes may need to be fixed by memory rather than computational efficiency considerations. As the memory consumption depends on the problem size, i.e., $n_{pix}$, in the same way
as the communication volume, by increasing the number of processes in unison with the size of the problem we can ensure that the transform runs efficiently, i.e., the fraction of the total
execution time spent on communicating will not change, and that it will always fit within the machine's memory.
We note that some improvement in the overall cost of the communication could be expected by employing a non-blocking communication pattern applied to smaller data objects, i.e., single rows of the arrays, $\Delta_A$ and $\Delta_S$ corresponding to a single ring or $m$ value, successively and immediately after they become available. Though such an approach could extend the parameter
space in which the computation time is dominant and the overall scaling favorable, it will most likely have only a limited impact given that the same, large, total volume of data needs to be communicated. The communication volume could be decreased, and thus again the parameter space with the computation dominance extended, if redundant data distributions and redundant calculations are allowed for in the spirit of the currently popular {\em communication avoiding} algorithms, e.g., \cite{grigori2008communication}. Though these kinds of approaches are clearly of interest, they are outside
of the scope of this work and are left here for future research.
We also note that these conclusions may disfavour algorithms which on the one hand require a big memory overhead and/or need to communicate more. Such algorithms will tend to require
a larger number of processes to be used for a given size of the problem and thus will quickly get into the bad scaling regime. Their application domain may therefore be rather limited.
We emphasize that this general conclusion should be reevaluated case-by-case for any specific algorithms. Likewise, it is clear that the regime of the scalability of the standard SHT algorithm
could be extended by decreasing the communication volume. Given that the communicated objects are dense, the latter would typically require some lossy compression techniques, which we do not
consider here. Instead we focus our attention on accelerating intra-node calculations. This could improve the transforms' runtimes for a small number of processes; however, if successful,
it will unavoidably also result in lowering the critical values of the process numbers, leading to losing the scalability earlier. Nevertheless, the overall performance of these improved transforms
would never be inferior to that of the standard ones whatever the number of processes.
\subsection{Exploiting intra-node parallelism}\label{sub:acclerating} % (fold)
In this Section we consider two extensions of the basic parallel algorithm, Algorithm~\ref{algo:Generallalm2mapS2HATParallel}, each going beyond its MPI-only paradigm. Our focus hereafter is therefore on the computational tasks as performed by each of the MPI processes within the top-level framework outlined earlier. Guided by the theoretical models from the previous Section we expect that any gain in performance of a single MPI process will translate directly into analogous gain in the overall performance of the transforms at least as long as the communication time is subdominant, i.e., in a relatively broad regime of cases of practical importance. Hereafter we therefore develop implementations suitable for multi-core processors and GPUs and discuss the benefits and trade-offs they imply.
As highlighted by Fig.~\ref{fig:theoretical_performance}, of the three computational steps, it is the recurrence used to compute the associated Legendre functions that takes by far the dominant fraction of the overall runtime. Consequently,
we will pay special attention to accelerating this part of the algorithm, while we will keep on using standard libraries to compute the FFTs. The latter will be shown to deliver a sufficient
performance given our goals here. The associated Legendre function recurrence involves three nested loops and we will particularly consider the benefits and downside of their different
ordering in order to attain the best possible performance.
Hereafter, for simplicity we will refer to the algorithms for direct and inverse spherical transforms by the core names of their corresponding routines i.e., \map2alm and \alm2map respectively.
\subsubsection{Multithreaded version for multi-core processors}\label{sub:multithreading_alm2mapmt} % (fold)
% Todays microarchitectures are multi-core and each core is often multithreaded. With the limit of CMOS technology in sight, increasing the number of threads in a system is fast becoming the major approach to improve system performance. Modern, high-performance microprocessors must also rely on efficient memory accesses, as the memory wall mounts. Therefore, there is a number of hardware solutions which provide the ability for all cores to access efficiently all memory as global address space i.e,. multiple processors can operate independently but share the same memory resources (Shared Memory model).
Since the largest and fastest computers in the world today employ both shared and distributed memory architectures, software which uses only a distributed memory approach such as MPI may not fully exploit the underlying shared memory architecture and thus may lose some efficiency. Shared memory approaches, based on explicit multithreading or language-generated multithreading (e.g., OpenMP), are better suited to shared memory architectures. Thus, many hybrid approaches, typically mixing MPI and OpenMP, have been proposed to better exploit clusters of multi-core processors. In this spirit, we introduce a second level of parallelism based on a multithreaded approach in the computation of SHTs.
As discussed briefly at the beginning of this section, the order of the three nested loops can be changed. For instance, the iterations over $m$ in the outermost loop allow precomputing $\beta_{\ell m}$, Eq.~\eqref{eqn:betaDef}, ${\cal P}_{m m}$, Eq.~\eqref{eqn:pmm}, and ${\cal P}_{m+1, m}$, Eq.~\eqref{eqn:pm1m}, only once for each ring with extra storage of order ${\cal O}\l(\ell_{max}\r)$. But in this case we also need to store intermediate products which require additional ${\cal O}\l({\cal R}_{N}\ell_{max}\r)$ storage comparable to the size of the input and output. However, modern supercomputers are equipped with memory banks of sizes in tens of GB. This encouraged us to develop multithreaded versions of Algorithms $1$ and $2$ in which we place the loop over $m$ outermost. That is, we introduced the multithread layer via OpenMP for the $m$ loop, i.e., a set of $m\in{\cal M}_{i}$ values (already assigned to the $i^{\text{th}}$ MPI process) is evenly divided into a number of subsets equal to the number of threads mapped onto each physical core available on a given multi-core processor. We denote this subset of $m$ values as ${\cal M}^\text{T}_i$, where the subindex $i$ denotes the $i^{\text{th}}$ thread. In such a way each physical core executes one thread concurrently and calculates intermediate results of SHTs for its local (with respect to shared memory) subset of $m$. Algorithm~\ref{algo:alm2mapS2HATParallelMT} describes \text{\sc step 1} of Algorithm~\ref{algo:Generallalm2mapS2HATParallel} in its multithreaded version.
\begin{algorithm}
\caption{{\sc $\bm{\Delta}^{A}_m$ calculation per thread}}
\begin{algorithmic}
\REQUIRE {all $m\in{\cal M}^{T}_{i}\;\Rightarrow \bigcup \limits^{\text{Nth}}_{i=1}{\cal M}_{i}^{T}={\cal M}_{i}$ }
\FOR{all $m \in {\cal M}^{T}_i$}
\STATE {
$\circ$ initialise ${\cal P}_{mm}$ Eqs.~\eqref{eqn:pmm} \\
}
\FOR{ \underline{every} ring $r$}
\FOR{ every $\ell = m+2, ..., \ell_{max}$}
\STATE {
$\circ$ precompute $\beta_{\ell m}$~Eq.~\eqref{eqn:betaDef} \\
$\circ$ compute ${\cal P}_{\ell m}$ via $\bm{\beta}_{\ell m}$, Eq.~\eqref{eqn:assLegRec} \\
$\circ$ update $\bm{\Delta}^{A}_m\l( r\r)$ in shared memory, Eq.~\eqref{eqn:DeltaDef} \\
}
\ENDFOR{ ($\ell$)}
\ENDFOR{($r$)}
\ENDFOR{($m$)}
\end{algorithmic}
\label{algo:alm2mapS2HATParallelMT}
\end{algorithm}
% subsection multithreading_alm2mapmt (end)
\subsubsection*{Load balance between active threads}
Selecting the loop over $m$ as the outermost one is certainly the best approach from the point of view of speed optimization (see the previous subsection), but such a configuration leads to poor load balance. Notice that due to the assumptions listed in Section~\ref{sec:additional_assumptions} the $\ell$-recursion starts from $\ell = m$ (see for example Eq.~\ref{eqn:DeltaDef}). Therefore, the number of FLOPs required for evaluating the recurrence from equation~\eqref{eqn:pmm} differs for different $m$. For that reason, we have to carefully map the subsets ${\cal M}^{T}_{i}$ onto active threads to ensure that the total time required for the evaluation of the problem on a given node will be equally divided among the available cores.
\begin{figure}[ht]
\centering
\includegraphics[width=0.95\textwidth]{figures/mt_thread.png}
\caption{Example of two ``min-max'' pairs of $m$ values assigned to Thread $X$ on Process 1 (left) and 0 (right) respectively. The red arrow represents the direction and the length of the $\ell$-recurrences (Eq.~\ref{eqn:pm1m}) evaluated by a thread for a given ``min-max'' pair. }\label{fig:mt_load_balance}
\end{figure}
Because the length of the loop over $\ell$ decreases with growing $m$, we create the subsets ${\cal M}^{T}_{i}$ by taking values from the set ${\cal M}_{i}$ in a ``max-min'' fashion, i.e., iteratively we take from ${\cal M}_{i}$ the maximum value and the minimal one and we assign them to a chosen thread. At the same time we try to balance the number of such pairs within each subset ${\cal M}^{T}_{i}$. An example of such a mapping is depicted in Figure~\ref{fig:mt_load_balance}.
In the next subsection, we introduce a similar multithreaded approach for accelerating the projection of the associated Legendre functions using a GPU.
\subsubsection{SHTs on GPGPU}\label{sub:sht_on_gpu} % (fold)
% The model for GPU computing is to use a CPU and a GPU together in a heterogeneous co-processing computing model. The sequential part of the application runs on the CPU and the computationally-intensive part is accelerated by the GPU. From the userâ€™s perspective, the application just runs faster because it is using the GPU to boost performance. NVIDIA revolutionized the GPGPU and accelerated computing world in 2006-2007 by introducing its new massively parallel architecture called "CUDA" (Compute Unified Device Architecture).
A CUDA (Compute Unified Device Architecture) device is a chip consisting of hundreds of simple cores, called Streaming Processors (SP), which execute instructions sequentially. Groups of such processors are encapsulated in a Streaming Multiprocessor (SM), which executes instructions in a SIMD (Single Instruction, Multiple Data) fashion. Therefore each SM manages hundreds of active threads in a cyclic pipeline in order to hide memory latency, i.e., a number of threads are grouped together and scheduled as a Warp, executing the same instructions in parallel. Accordingly, CUDA threads are grouped together into a series of thread blocks. All threads in the thread block execute on the same SM, and can synchronise and exchange data using very fast but small (up to 48\,KB in size for the latest NVIDIA cards) shared memory. The shared memory can be used as a user-managed cache enabling higher bandwidth than direct data copy from the slow device global memory.
The restrictions related to memory (both size and speed) and the lack of synchronisation between the blocks of threads drive the structure of our algorithms devised for GPUs. For instance, placing the loop over $m$ as outermost (like in algorithms dedicated to multi-core CPUs) is not appropriate for CUDA applications due to the memory size limitation. Consequently, the vector \blm{} cannot be entirely stored in shared memory. Its values would need to be recomputed for each $m$ and accessed sequentially in the $\ell$-loop. A possible implementation would require re-computing the \blm{} coefficients for each pass through the $\ell$-loop, which would significantly affect the performance. Therefore, to agree with the philosophy of programming for GPUs, which consists in using fine grained parallelism and launching a very large number of threads in order to use all the available cores and hide memory latency, we set iterations over $r$ in the outermost loop and parallelize it in such a way that a number of rings (preferably only one) is assigned to one thread. In this model each thread computes the 2-point recurrence for all \emph{available} $m$-values (recall that we assume the GPU is a part of a distributed memory system) in a contiguous fashion as depicted in Figure~\ref{fig:cuda_alm_access}. This makes it straightforward to plan the computation of \blm{} via Eq.~\eqref{eqn:pmm} in batches, which can be cached and shared inside a thread block. Note that in case of distributed memory systems the number of $m$ values in the set ${\cal M}_{i}$ may vary for different $i$, while the total number of rings ${\cal R}_{N}$ stays constant. Therefore, it is easier to plan a good load balance for the GPU when we parallelize our algorithms over the $r$ loop.
\begin{figure}[ht]
\centering
\includegraphics[width=0.95\textwidth]{figures/cuda_thread.png}\caption{Schematic view of $\ell$-recurrence iterations (eq.~\ref{eqn:pm1m}) evaluated by each thread on CUDA device assigned to Process $1$ and $0$ respectively.}\label{fig:cuda_alm_access}
\end{figure}
Setting loop over rings outermost in our nested loop triggers two completely different approaches for memory management in direct and inverse SHTs. Therefore we dedicate the following two subsections to describe in detail the differences between \map2alm and \alm2map algorithms for GPUs.
\subsubsection*{\map2alm}
Mapping rings onto a big number of threads indeed facilitates the parallelization of the SHT algorithm on GPUs, i.e., in such a set-up each active thread performs exactly the same number and type of operations but with different values. However, the direct transform requires that for each ring we update all non-zero values of the matrix $\bm{a}_{\ell,m}\in\mathbb{C}^{m_{max}\times\ell_{max}}$. This leads to an all-reduce operation\footnote{A reduce-type operation combines the distributed elements using a predefined operation (e.g., summation, multiplication, etc.) and returns the combined value in the output buffer.} between active threads. This is to say, the partial results generated by all active threads need to be summed together and written in the memory space from which we will read the final solution after all threads have been executed. Note that on CUDA, active threads can communicate only within a thread block, which makes this task more difficult.
Algorithm~\ref{alg:cuda_map2alm} outlines {\sc Step 3} of Algorithm~\ref{algo:map2almBasic} adapted to CUDA. We assume, however, that we are working on a distributed memory system, therefore the range of $m$ values is limited to ${\cal M}_{i}$. To work around the high latency of the device memory and take advantage of the fast shared memory, we first calculate the values of the \blm{} vectors in segments, which allows us to fit them into the shared memory, i.e., all threads within a block compute in advance the \blm{} values in a ``collaborative'' way and store them in shared memory. Next, we use those precomputed values to evaluate the associated Legendre function ${\cal P}_{\ell m}(r)$. Afterwards, each active thread fetches from global memory a precomputed sum $\bm{\Delta}^{S}_m\l( r\r)$ and performs a final multiplication, Eq.~\eqref{eqn:almdef}. Once this is done, we sum all those partial results within a given block of threads using an adapted\footnote{We omitted a part of the code which writes the partial result to global memory.} version of the parallel reduction primitive from the CUDA SDK. For each such locally reduced result one thread in the block updates the corresponding entry in global memory. This implies that $j = {\tt NBT}$ (number of blocks of threads) threads will try to increase the value of a variable at the same location of global memory --- which is referred to as a \emph{race condition problem}, common in multithreaded applications. Fortunately, race conditions are easy to avoid in CUDA by using atomic operations. An atomic operation is capable of reading, modifying, and writing a value back to memory without the interference of any other threads, which guarantees that a race condition will not occur. Therefore, all final updates of the results in global memory are performed via the CUDA {\tt atomicAdd} function. In case atomic functions are not supported on a given GPU we can easily avoid race conditions by increasing the granularity of the recurrence step.
In this scenario we loop over $m$ values on the CPU and execute ${\cal M}_{i}$ ``\emph{smaller}'' CUDA kernels which, for fixed values of $m$ and $\bm{\mu}_m$, evaluate ${\tt NBT}$ reduced (per block of threads) results $a^{\star}_{\ell m}$, which are next copied from device to host memory and summed together using the CPU. This variant of our algorithm corresponds to the state {\tt HYBRID\_EVALUATION = true} in listing~\ref{alg:cuda_map2alm}.
\begin{algorithm}
\caption{{\sc \aalm{} calculation on CPU\&GPU }}
\begin{algorithmic}
\REQUIRE {$\bm{\Delta}^{S}_m$ \& $\bm{\mu}_m$ vectors in GPU global memory for $m\in{\cal M}_i$}
\FOR{ \underline{every} ring $r$ assigned to one GPU thread}
\FOR{every $m \in {\cal M}_i$}
\FOR{ every $\ell = m+2, ..., \ell_{max}$}
\STATE{ $\bullet$ use precomputed or, if needed, precompute in parallel a segment of \blm{}~\eqref{eqn:betaDef}; }
\STATE{ $\bullet$ compute ${\cal P}_{\ell m}$ via Eq.~\ref{eqn:assLegRec}; }
\STATE{ $\bullet$ evaluate: ${a}^{r}_{\ell m} = \bm{\Delta}^{S}_m\l( r\r) {\cal P}_{\ell m}(r)$}
\STATE{ $\bullet$ sum all partial results ${a}^{r}_{\ell m}$ within a block of threads
\begin{equation}\label{eq:cuda_partial_alm}
{a}^{\star}_{\ell m} = \sum^{\tt NTPB }_{i = 0} \l({a}^{r}_{\ell m}\r)_{i}
\end{equation}
}
\IF {{\tt HYBRID\_EVALUATION}}
\STATE{ $\bullet$ for each block of threads with different id = {\tt BID}, write $\bm{a}^{\star}_{\ell m}$ in global device memory}
\begin{equation*}
\bm{a}^{GPU}_{\ell m}[{\tt BID}] = {a}^{\star}_{\ell m}
\end{equation*}
\STATE{ $\bullet$ copy partial results $\bm{a}^{GPU}_{\ell m}[\ldots]$ from device to host memory and sum together using CPU}
\FOR{$i = 0 \to {\tt NBT}$}
\STATE {$\bm{a}_{\ell m} = \bm{a}_{\ell m} + {a}^{\tt GPU}_{\ell m}[i]$ }
\ENDFOR
\ELSE
\STATE{ $\bullet$ update $\bm{a}_{\ell m}$ in global memory via atomic addition: $\bm{a}_{\ell m} = \bm{a}_{\ell m} + {a}^{\star}_{\ell m}$ }
\ENDIF
\medskip
\ENDFOR{ ($\ell$)}
\ENDFOR{($m$)}
\ENDFOR{($r$)}
\medskip
\RETURN{$\bm{a}_{\ell,m}\in\mathbb{C}^{m_{max}\times\ell_{max}}$ }
\end{algorithmic}
\label{alg:cuda_map2alm}
\end{algorithm}
\subsubsection*{\alm2map}
The inverse SHT also requires the reduction of partial results, but in contrast to the direct transform, we need to sum them for all $\ell$ as displayed in Eq.~\eqref{eqn:DeltaDef}. With rings assigned to single threads, this leads to a rather straightforward implementation. Algorithm~\ref{algo:alm2mapS2HATParallelCUDA} outlines the \alm2map algorithm devised for GPUs and it is based on the version detailed in~\cite{hupca}, which conforms with all assumptions stated in this paper.
\begin{algorithm}
\caption{{\sc $\bm{\Delta}^{A}_m$ calculation on GPU}}
\begin{algorithmic}
\REQUIRE {$\bm{a}_{\ell,m}\in\mathbb{C}^{m_{max}\times\ell_{max}}$ matrix in GPU global memory}
\FOR{ \underline{every} ring $r$ assigned to one GPU thread}
\FOR{$m \in {\cal M}_i$}
\FOR{ every $\ell = m+2, ..., \ell_{max}$}
\STATE{ $\bullet$ use fetched or, if needed, fetch in parallel a segment of \aalm{} data; }
\STATE{ $\bullet$ use precomputed or, if needed, precompute in parallel a segment of \blm{}~\eqref{eqn:betaDef}; }
\STATE{ $\bullet$ compute ${\cal P}_{\ell m}$ via Eq.~\ref{eqn:assLegRec}; }
\STATE{ $\bullet$ update $\bm{\Delta}^{A}_m\l( r\r)$ for a given prefetched $\bm{a}_{\ell m}$ and computed ${\cal P}_{\ell m}$ \eqref{eqn:DeltaDef}; }
\ENDFOR{ ($\ell$)}
\STATE{ $\bullet$ save final $\bm{\Delta}^{A}_m\l( r\r)$ in global memory }
\ENDFOR{($m$)}
\ENDFOR{($r$)}
\medskip
\RETURN{vector of partial results: $\bm{\Delta}^{A}_m\in\mathbb{C}$ }
\end{algorithmic}
\label{algo:alm2mapS2HATParallelCUDA}
\end{algorithm}
First, we partition the vector of the \aalm{} coefficients in the global memory into small tiles. In this way a block of threads can fetch the resulting segments in sequence and fit them in the shared memory. Next, we calculate the \blm{} values in the same way as in the \map2alm algorithm. With all the necessary data in the fast shared memory, in the following steps, each thread evaluates in parallel an associated Legendre function and updates the partial result $\bm{\Delta}^{A}_m$. In contrast to the direct transform, those updates are performed without a race condition problem, i.e., each thread stores its temporary values in a small space of the shared memory to which it has an exclusive and very fast access. Once the loop over all $\ell$ is finished, the final partial result is written to the global memory.
\subsection{Exploitation of hybrid architectures}
Many new supercomputers are hybrid clusters, i.e., they are a combination of computational nodes interconnected (usually via a PCI-Express bus) with multiple GPUs. In such a way, one or more multi-core processors have access to one GPU. Facing these heterogeneities, the front end user of the spherical harmonic package will have to decide which type of architecture to choose for the calculation. The three-stage structure of our algorithms allows us to freely map different steps of the algorithms onto different accelerators in a hybrid system. However, the steps have to be evaluated sequentially in unchanged order. The intention of this flexibility is to facilitate the selection of the most efficient library for FFTs, e.g., NVIDIA provides its own library for FFTs on GPUs, i.e., {\tt CUFFT}~\cite{CUFFT}, whose efficiency strongly depends on the version of the CUDA device and the version of the routine. Therefore, on some supercomputers with a highly tuned version of the FFTW3 library, FFTs perform better on multi-core CPUs. Note that our algorithms require the evaluation of FFTs on complex data, whose length may vary from ring to ring.
In practice, on heterogeneous systems we assign each step of the algorithm to the architecture on which it performs faster. The default assignment for both transformations is depicted in Figure~\ref{fig:heterogeneus}. Note that due to their sequential nature, we omit steps in which we evaluate the starting values of the recurrence Eq.~\eqref{eqn:pmm} (step 2 in Algorithm~\ref{algo:map2almBasic} and step 1 in Algorithm~\ref{algo:alm2mapBasic}), which are always computed on the CPU.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=0.75\textwidth]{figures/heterogeneus.png}
\end{center}
\caption{Default assignment of the major steps of the SHTs algorithm to one of the two architectures, multi-core processor or GPU, in a GPU/CPU heterogeneous system. Step 3 in {\tt map2alm} algorithm covers two architectures due to optional use of CPU for reducing partial results between the execution of the CUDA kernels.}
\label{fig:heterogeneus}
\vskip -0.5truecm
\end{figure}
% subsection parallelization (end)
% section background (end)
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../s2hat_jpaper"
%%% End: