/* ! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ! ! ! Maintainers : support@fluidnumerics.com ! Official Repository : https://github.com/FluidNumerics/self/ ! ! Copyright © 2024 Fluid Numerics LLC ! ! Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: ! ! 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. ! ! 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in ! the documentation and/or other materials provided with the distribution. ! ! 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from ! this software without specific prior written permission. ! ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ! HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ! LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ! THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF ! THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ! ! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ! */ #include "SELF_GPU_Macros.h" #include <math.h> /* * Equation of state: p = p0 * (rho * Rd * theta / p0)^gamma */ __device__ real eos_pressure(real rho, real theta, real p0, real Rd, real gamma) { return p0 * pow(rho * Rd * theta / p0, gamma); } /* * Logarithmic mean used by the Souza et al. (2023, JAMES) * entropy-conservative two-point flux for compressible Euler with * potential temperature: * * <a>_log = (a - b) / (ln(a) - ln(b)) for a != b * <a>_log = (a + b) / 2 for a == b * * Implemented with a Taylor series in u = ((a-b)/(a+b))^2 near a == b * to avoid 0/0. */ __device__ inline real log_mean_dev(real a, real b) { real zeta = a / b; real f = (zeta - (real)1.0) / (zeta + (real)1.0); real u = f * f; real F_F; if (u < (real)1.0e-2) { F_F = (real)1.0 + u/(real)3.0 + u*u/(real)5.0 + u*u*u/(real)7.0; } else { F_F = log(zeta) / ((real)2.0 * f); } return (a + b) / ((real)2.0 * F_F); } // ============================================================ // No-normal-flow boundary condition for Entropy-Stable Atmosphere 3D // // Mirrors density (var 0) and rho*theta (var 4). // Reflects momentum: (rho*v)_ext = (rho*v)_int - 2*((rho*v).n)*n // ============================================================ __global__ void hbc3d_nonormalflow_esatmo3d_kernel( real *extBoundary, real *boundary, real *nhat, int *elements, int *sides, int nBoundaries, int N, int nel, int nvar) { uint32_t idof = threadIdx.x + blockIdx.x * blockDim.x; uint32_t dofs_per_face = (N + 1) * (N + 1); uint32_t total_dofs = nBoundaries * dofs_per_face; if (idof < total_dofs) { uint32_t i = idof % (N + 1); uint32_t j = (idof / (N + 1)) % (N + 1); uint32_t n = idof / dofs_per_face; uint32_t e1 = elements[n] - 1; uint32_t s1 = sides[n] - 1; // Load unit normal real nx = nhat[VEB_3D_INDEX(i, j, s1, e1, 0, 0, N, nel, 1)]; real ny = nhat[VEB_3D_INDEX(i, j, s1, e1, 0, 1, N, nel, 1)]; real nz = nhat[VEB_3D_INDEX(i, j, s1, e1, 0, 2, N, nel, 1)]; // Mirror density extBoundary[SCB_3D_INDEX(i, j, s1, e1, 0, N, nel)] = boundary[SCB_3D_INDEX(i, j, s1, e1, 0, N, nel)]; // Load interior momentum real rhou = boundary[SCB_3D_INDEX(i, j, s1, e1, 1, N, nel)]; real rhov = boundary[SCB_3D_INDEX(i, j, s1, e1, 2, N, nel)]; real rhow = boundary[SCB_3D_INDEX(i, j, s1, e1, 3, N, nel)]; // Normal momentum component real rhovn = rhou * nx + rhov * ny + rhow * nz; // Reflect: ext = int - 2*(int.n)*n extBoundary[SCB_3D_INDEX(i, j, s1, e1, 1, N, nel)] = rhou - 2.0 * rhovn * nx; extBoundary[SCB_3D_INDEX(i, j, s1, e1, 2, N, nel)] = rhov - 2.0 * rhovn * ny; extBoundary[SCB_3D_INDEX(i, j, s1, e1, 3, N, nel)] = rhow - 2.0 * rhovn * nz; // Mirror rho*theta extBoundary[SCB_3D_INDEX(i, j, s1, e1, 4, N, nel)] = boundary[SCB_3D_INDEX(i, j, s1, e1, 4, N, nel)]; // Mirror geopotential (Phi depends on z only — same on both sides) extBoundary[SCB_3D_INDEX(i, j, s1, e1, 5, N, nel)] = boundary[SCB_3D_INDEX(i, j, s1, e1, 5, N, nel)]; } } extern "C" { void hbc3d_nonormalflow_esatmo3d_gpu( real *extBoundary, real *boundary, real *nhat, int *elements, int *sides, int nBoundaries, int N, int nel, int nvar) { int dofs_per_face = (N + 1) * (N + 1); int total_dofs = nBoundaries * dofs_per_face; int threads_per_block = 256; int nblocks_x = total_dofs / threads_per_block + 1; hbc3d_nonormalflow_esatmo3d_kernel<<<dim3(nblocks_x, 1, 1), dim3(threads_per_block, 1, 1), 0, 0>>>( extBoundary, boundary, nhat, elements, sides, nBoundaries, N, nel, nvar); } } // ============================================================ // Parabolic no-stress / no-heat-flux BC. // Reflects the normal component of the solution gradient at every // wall node so BR1 averaging gives avgGrad . n = 0 (zero diffusive // normal flux for every variable). // grad_ext = grad_int - 2*(grad_int . n)*n // ============================================================ __global__ void pbc3d_nostress_esatmo3d_kernel( real *extGrad, real *grad, real *nhat, int *elements, int *sides, int nBoundaries, int N, int nel, int nvar) { uint32_t idof = threadIdx.x + blockIdx.x * blockDim.x; uint32_t dofs_per_face = (N + 1) * (N + 1); uint32_t total_dofs = nBoundaries * dofs_per_face; if (idof < total_dofs) { uint32_t i = idof % (N + 1); uint32_t j = (idof / (N + 1)) % (N + 1); uint32_t n = idof / dofs_per_face; uint32_t e1 = elements[n] - 1; uint32_t s1 = sides[n] - 1; real nx = nhat[VEB_3D_INDEX(i, j, s1, e1, 0, 0, N, nel, 1)]; real ny = nhat[VEB_3D_INDEX(i, j, s1, e1, 0, 1, N, nel, 1)]; real nz = nhat[VEB_3D_INDEX(i, j, s1, e1, 0, 2, N, nel, 1)]; for (int iVar = 0; iVar < nvar; iVar++) { real gx = grad[VEB_3D_INDEX(i, j, s1, e1, iVar, 0, N, nel, nvar)]; real gy = grad[VEB_3D_INDEX(i, j, s1, e1, iVar, 1, N, nel, nvar)]; real gz = grad[VEB_3D_INDEX(i, j, s1, e1, iVar, 2, N, nel, nvar)]; real gn = gx * nx + gy * ny + gz * nz; extGrad[VEB_3D_INDEX(i, j, s1, e1, iVar, 0, N, nel, nvar)] = gx - (real)2.0 * gn * nx; extGrad[VEB_3D_INDEX(i, j, s1, e1, iVar, 1, N, nel, nvar)] = gy - (real)2.0 * gn * ny; extGrad[VEB_3D_INDEX(i, j, s1, e1, iVar, 2, N, nel, nvar)] = gz - (real)2.0 * gn * nz; } } } extern "C" { void pbc3d_nostress_esatmo3d_gpu( real *extGrad, real *grad, real *nhat, int *elements, int *sides, int nBoundaries, int N, int nel, int nvar) { int dofs_per_face = (N + 1) * (N + 1); int total_dofs = nBoundaries * dofs_per_face; int threads_per_block = 256; int nblocks_x = total_dofs / threads_per_block + 1; pbc3d_nostress_esatmo3d_kernel<<<dim3(nblocks_x, 1, 1), dim3(threads_per_block, 1, 1), 0, 0>>>( extGrad, grad, nhat, elements, sides, nBoundaries, N, nel, nvar); } } // ============================================================ // LLF (Rusanov) boundary flux for Entropy-Stable Atmosphere 3D // // F* = 0.5*(fL + fR) - 0.5*lambda_max*(sR - sL) // lambda_max = max(|unL|+cL, |unR|+cR) // c = sqrt(gamma * p / rho) // // Variables: [rho, rhou, rhov, rhow, rhotheta] // ============================================================ __global__ void boundaryflux_esatmo3d_kernel( real *fb, real *fextb, real *nhat, real *nscale, real *flux, real p0, real Rd, real gamma, int N, int nel) { // LMARS (Low-Mach Approximate Riemann Solver, Chen et al. 2013). // // un* = 0.5*(unL+unR) - (pR - pL) / (2*rho_bar*c_bar) // p* = 0.5*(pL+pR) - 0.5*rho_bar*c_bar*(unR - unL) // // Upwind on sign of un* selects which side supplies the conserved // state for the advective part. Pressure adds to normal momentum. // Geopotential (var 5) has zero flux. Gravity is folded into the // SourceMethod via the Souza non-conservative term so no hydrostatic // pressure split is applied here. uint32_t idof = threadIdx.x + blockIdx.x * blockDim.x; uint32_t ndof = (N + 1) * (N + 1) * 6 * nel; if (idof < ndof) { real nx = nhat[idof]; real ny = nhat[idof + ndof]; real nz = nhat[idof + 2 * ndof]; real nmag = nscale[idof]; // Left state (interior) real rhoL = fb[idof]; real rhouL = fb[idof + ndof]; real rhovL = fb[idof + 2 * ndof]; real rhowL = fb[idof + 3 * ndof]; real rthetaL = fb[idof + 4 * ndof]; real uL = rhouL / rhoL; real vL = rhovL / rhoL; real wL = rhowL / rhoL; real thetaL = rthetaL / rhoL; real pL = eos_pressure(rhoL, thetaL, p0, Rd, gamma); real unL = uL * nx + vL * ny + wL * nz; real cL = sqrt(gamma * pL / rhoL); // Right state (exterior) real rhoR = fextb[idof]; real rhouR = fextb[idof + ndof]; real rhovR = fextb[idof + 2 * ndof]; real rhowR = fextb[idof + 3 * ndof]; real rthetaR = fextb[idof + 4 * ndof]; real uR = rhouR / rhoR; real vR = rhovR / rhoR; real wR = rhowR / rhoR; real thetaR = rthetaR / rhoR; real pR = eos_pressure(rhoR, thetaR, p0, Rd, gamma); real unR = uR * nx + vR * ny + wR * nz; real cR = sqrt(gamma * pR / rhoR); // LMARS interface velocity and pressure. real rho_bar = (real)0.5 * (rhoL + rhoR); real c_bar = (real)0.5 * (cL + cR); real rc = rho_bar * c_bar; real un_star = (real)0.5 * (unL + unR) - (pR - pL) / ((real)2.0 * rc); real p_star = (real)0.5 * (pL + pR) - (real)0.5 * rc * (unR - unL); // Upwind on un_star real s0, s1, s2, s3, s4; if (un_star >= (real)0) { s0 = rhoL; s1 = rhouL; s2 = rhovL; s3 = rhowL; s4 = rthetaL; } else { s0 = rhoR; s1 = rhouR; s2 = rhovR; s3 = rhowR; s4 = rthetaR; } flux[idof] = (s0 * un_star) * nmag; flux[idof + ndof] = (s1 * un_star + p_star * nx) * nmag; flux[idof + 2 * ndof] = (s2 * un_star + p_star * ny) * nmag; flux[idof + 3 * ndof] = (s3 * un_star + p_star * nz) * nmag; flux[idof + 4 * ndof] = (s4 * un_star) * nmag; flux[idof + 5 * ndof] = (real)0; // geopotential — no flux } } extern "C" { void boundaryflux_esatmo3d_gpu( real *fb, real *fextb, real *nhat, real *nscale, real *flux, real p0, real Rd, real gamma, int N, int nel) { int ndof = (N + 1) * (N + 1) * 6 * nel; int threads_per_block = 256; int nblocks_x = ndof / threads_per_block + 1; boundaryflux_esatmo3d_kernel<<<dim3(nblocks_x, 1, 1), dim3(threads_per_block, 1, 1), 0, 0>>>( fb, fextb, nhat, nscale, flux, p0, Rd, gamma, N, nel); } } // ============================================================ // Souza et al. (2023, JAMES) entropy-conservative two-point flux for // Entropy-Stable Atmosphere 3D with potential temperature on curvilinear mesh, with // well-balanced hydrostatic pressure split. // // Physical flux: // f_d(rho) = <rho>_log * <v_d> // f_d(rho*v_i) = <rho>_log * <v_i> * <v_d> + (<p> - <p_hyd>) * delta_{id} // f_d(rho*th) = <rho*theta>_log * <v_d> // // where <a>_log is the logarithmic mean, <a> the arithmetic mean. // The hydrostatic pressure subtraction makes the volume tendency for // (rho*u, rho*v, rho*w) vanish exactly in the hydrostatic state. // // For each computational direction r and node pair, the contravariant // flux is the parent's split-form projection: // // Fc^r = avg(Ja^r_d) * Fphys_d (summed over physical dirs d=1,2,3) // // Memory layout: // s: SC_3D_INDEX(i,j,k,iel,ivar,N,nel) // dsdx: dsdx[iq + nq*(iel + nel*(d + 3*r))] for Ja^r_d // f: TPV_3D_INDEX(nn,i,j,k,iel,ivar,idir,N,nel,nvar) // // nvar = 6: (rho, rho*u, rho*v, rho*w, rho*theta, Phi). Phi has zero // flux; gravity is handled by the Souza non-conservative source term. // ============================================================ template <int blockSize> __global__ void __launch_bounds__(512) twopointfluxmethod_esatmo3d_kernel( real *f, real *s, real *dsdx, real p0, real Rd, real gamma, int nq, int N, int nvar) { uint32_t iq = threadIdx.x; if (iq < (uint32_t)nq) { uint32_t iel = blockIdx.x; uint32_t nel = gridDim.x; uint32_t ii = iq % (N + 1); uint32_t jj = (iq / (N + 1)) % (N + 1); uint32_t kk = iq / (N + 1) / (N + 1); // Load state at (i,j,k) real rho_ijk = s[iq + nq * (iel + nel * 0)]; real rhou_ijk = s[iq + nq * (iel + nel * 1)]; real rhov_ijk = s[iq + nq * (iel + nel * 2)]; real rhow_ijk = s[iq + nq * (iel + nel * 3)]; real rtheta_ijk = s[iq + nq * (iel + nel * 4)]; real u_ijk = rhou_ijk / rho_ijk; real v_ijk = rhov_ijk / rho_ijk; real w_ijk = rhow_ijk / rho_ijk; real theta_ijk = rtheta_ijk / rho_ijk; real p_ijk = eos_pressure(rho_ijk, theta_ijk, p0, Rd, gamma); int nq4 = nq * (N + 1); // stride between idir slices in TPV layout for (int nn = 0; nn < N + 1; nn++) { // ============== xi^1: pair (i,j,k)-(nn,j,k) ============== uint32_t iq1 = nn + (N + 1) * (jj + (N + 1) * kk); real rho1 = s[iq1 + nq * (iel + nel * 0)]; real rhou1 = s[iq1 + nq * (iel + nel * 1)]; real rhov1 = s[iq1 + nq * (iel + nel * 2)]; real rhow1 = s[iq1 + nq * (iel + nel * 3)]; real rtheta1 = s[iq1 + nq * (iel + nel * 4)]; real u1 = rhou1 / rho1; real v1 = rhov1 / rho1; real w1 = rhow1 / rho1; real theta1 = rtheta1 / rho1; real p1 = eos_pressure(rho1, theta1, p0, Rd, gamma); real rho_log = log_mean_dev(rho_ijk, rho1); real rth_log = log_mean_dev(rtheta_ijk, rtheta1); real u_a = (real)0.5 * (u_ijk + u1); real v_a = (real)0.5 * (v_ijk + v1); real w_a = (real)0.5 * (w_ijk + w1); real p_a = (real)0.5 * (p_ijk + p1); for (int ivar = 0; ivar < 5; ivar++) { real Fphys_x, Fphys_y, Fphys_z; if (ivar == 0) { Fphys_x = rho_log * u_a; Fphys_y = rho_log * v_a; Fphys_z = rho_log * w_a; } else if (ivar == 1) { Fphys_x = rho_log * u_a * u_a + p_a; Fphys_y = rho_log * u_a * v_a; Fphys_z = rho_log * u_a * w_a; } else if (ivar == 2) { Fphys_x = rho_log * v_a * u_a; Fphys_y = rho_log * v_a * v_a + p_a; Fphys_z = rho_log * v_a * w_a; } else if (ivar == 3) { Fphys_x = rho_log * w_a * u_a; Fphys_y = rho_log * w_a * v_a; Fphys_z = rho_log * w_a * w_a + p_a; } else { Fphys_x = rth_log * u_a; Fphys_y = rth_log * v_a; Fphys_z = rth_log * w_a; } real Ja1_x = (real)0.5 * (dsdx[iq + nq * (iel + nel * (0 + 3 * 0))] + dsdx[iq1 + nq * (iel + nel * (0 + 3 * 0))]); real Ja1_y = (real)0.5 * (dsdx[iq + nq * (iel + nel * (1 + 3 * 0))] + dsdx[iq1 + nq * (iel + nel * (1 + 3 * 0))]); real Ja1_z = (real)0.5 * (dsdx[iq + nq * (iel + nel * (2 + 3 * 0))] + dsdx[iq1 + nq * (iel + nel * (2 + 3 * 0))]); f[nn + (N + 1) * iq + nq4 * (iel + nel * (ivar + nvar * 0))] = Ja1_x * Fphys_x + Ja1_y * Fphys_y + Ja1_z * Fphys_z; } // Geopotential (var 5) carries no flux. f[nn + (N + 1) * iq + nq4 * (iel + nel * (5 + nvar * 0))] = (real)0; // ============== xi^2: pair (i,j,k)-(i,nn,k) ============== uint32_t iq2 = ii + (N + 1) * (nn + (N + 1) * kk); real rho2 = s[iq2 + nq * (iel + nel * 0)]; real rhou2 = s[iq2 + nq * (iel + nel * 1)]; real rhov2 = s[iq2 + nq * (iel + nel * 2)]; real rhow2 = s[iq2 + nq * (iel + nel * 3)]; real rtheta2 = s[iq2 + nq * (iel + nel * 4)]; real u2 = rhou2 / rho2; real v2 = rhov2 / rho2; real w2 = rhow2 / rho2; real theta2 = rtheta2 / rho2; real p2 = eos_pressure(rho2, theta2, p0, Rd, gamma); rho_log = log_mean_dev(rho_ijk, rho2); rth_log = log_mean_dev(rtheta_ijk, rtheta2); u_a = (real)0.5 * (u_ijk + u2); v_a = (real)0.5 * (v_ijk + v2); w_a = (real)0.5 * (w_ijk + w2); p_a = (real)0.5 * (p_ijk + p2); for (int ivar = 0; ivar < 5; ivar++) { real Fphys_x, Fphys_y, Fphys_z; if (ivar == 0) { Fphys_x = rho_log * u_a; Fphys_y = rho_log * v_a; Fphys_z = rho_log * w_a; } else if (ivar == 1) { Fphys_x = rho_log * u_a * u_a + p_a; Fphys_y = rho_log * u_a * v_a; Fphys_z = rho_log * u_a * w_a; } else if (ivar == 2) { Fphys_x = rho_log * v_a * u_a; Fphys_y = rho_log * v_a * v_a + p_a; Fphys_z = rho_log * v_a * w_a; } else if (ivar == 3) { Fphys_x = rho_log * w_a * u_a; Fphys_y = rho_log * w_a * v_a; Fphys_z = rho_log * w_a * w_a + p_a; } else { Fphys_x = rth_log * u_a; Fphys_y = rth_log * v_a; Fphys_z = rth_log * w_a; } real Ja2_x = (real)0.5 * (dsdx[iq + nq * (iel + nel * (0 + 3 * 1))] + dsdx[iq2 + nq * (iel + nel * (0 + 3 * 1))]); real Ja2_y = (real)0.5 * (dsdx[iq + nq * (iel + nel * (1 + 3 * 1))] + dsdx[iq2 + nq * (iel + nel * (1 + 3 * 1))]); real Ja2_z = (real)0.5 * (dsdx[iq + nq * (iel + nel * (2 + 3 * 1))] + dsdx[iq2 + nq * (iel + nel * (2 + 3 * 1))]); f[nn + (N + 1) * iq + nq4 * (iel + nel * (ivar + nvar * 1))] = Ja2_x * Fphys_x + Ja2_y * Fphys_y + Ja2_z * Fphys_z; } f[nn + (N + 1) * iq + nq4 * (iel + nel * (5 + nvar * 1))] = (real)0; // ============== xi^3: pair (i,j,k)-(i,j,nn) ============== uint32_t iq3 = ii + (N + 1) * (jj + (N + 1) * nn); real rho3 = s[iq3 + nq * (iel + nel * 0)]; real rhou3 = s[iq3 + nq * (iel + nel * 1)]; real rhov3 = s[iq3 + nq * (iel + nel * 2)]; real rhow3 = s[iq3 + nq * (iel + nel * 3)]; real rtheta3 = s[iq3 + nq * (iel + nel * 4)]; real u3 = rhou3 / rho3; real v3 = rhov3 / rho3; real w3 = rhow3 / rho3; real theta3 = rtheta3 / rho3; real p3 = eos_pressure(rho3, theta3, p0, Rd, gamma); rho_log = log_mean_dev(rho_ijk, rho3); rth_log = log_mean_dev(rtheta_ijk, rtheta3); u_a = (real)0.5 * (u_ijk + u3); v_a = (real)0.5 * (v_ijk + v3); w_a = (real)0.5 * (w_ijk + w3); p_a = (real)0.5 * (p_ijk + p3); for (int ivar = 0; ivar < 5; ivar++) { real Fphys_x, Fphys_y, Fphys_z; if (ivar == 0) { Fphys_x = rho_log * u_a; Fphys_y = rho_log * v_a; Fphys_z = rho_log * w_a; } else if (ivar == 1) { Fphys_x = rho_log * u_a * u_a + p_a; Fphys_y = rho_log * u_a * v_a; Fphys_z = rho_log * u_a * w_a; } else if (ivar == 2) { Fphys_x = rho_log * v_a * u_a; Fphys_y = rho_log * v_a * v_a + p_a; Fphys_z = rho_log * v_a * w_a; } else if (ivar == 3) { Fphys_x = rho_log * w_a * u_a; Fphys_y = rho_log * w_a * v_a; Fphys_z = rho_log * w_a * w_a + p_a; } else { Fphys_x = rth_log * u_a; Fphys_y = rth_log * v_a; Fphys_z = rth_log * w_a; } real Ja3_x = (real)0.5 * (dsdx[iq + nq * (iel + nel * (0 + 3 * 2))] + dsdx[iq3 + nq * (iel + nel * (0 + 3 * 2))]); real Ja3_y = (real)0.5 * (dsdx[iq + nq * (iel + nel * (1 + 3 * 2))] + dsdx[iq3 + nq * (iel + nel * (1 + 3 * 2))]); real Ja3_z = (real)0.5 * (dsdx[iq + nq * (iel + nel * (2 + 3 * 2))] + dsdx[iq3 + nq * (iel + nel * (2 + 3 * 2))]); f[nn + (N + 1) * iq + nq4 * (iel + nel * (ivar + nvar * 2))] = Ja3_x * Fphys_x + Ja3_y * Fphys_y + Ja3_z * Fphys_z; } f[nn + (N + 1) * iq + nq4 * (iel + nel * (5 + nvar * 2))] = (real)0; } } } extern "C" { void twopointfluxmethod_esatmo3d_gpu( real *f, real *s, real *dsdx, real p0, real Rd, real gamma, int N, int nvar, int nel) { int nq = (N + 1) * (N + 1) * (N + 1); if (N < 4) { twopointfluxmethod_esatmo3d_kernel<64><<<dim3(nel, 1, 1), dim3(64, 1, 1), 0, 0>>>(f, s, dsdx, p0, Rd, gamma, nq, N, nvar); } else if (N >= 4 && N < 8) { twopointfluxmethod_esatmo3d_kernel<512><<<dim3(nel, 1, 1), dim3(512, 1, 1), 0, 0>>>(f, s, dsdx, p0, Rd, gamma, nq, N, nvar); } } } // ============================================================ // Souza et al. (2023) non-conservative gravity flux differencing. // At each receiving node (i,j,k,iel) compute // src(rhow)|_i = -(1/J_i) sum_r sum_j D_split^r[i_r, j] // * 0.5*(Ja^r_z(i)+Ja^r_z(j)) // * <rho>_log(s_i, s_j) * (Phi_j - Phi_i) // using the geopotential Phi carried as state variable index 5 // (0-indexed). All other source components are set to zero. template <int blockSize> __global__ void __launch_bounds__(512) sourcemethod_esatmo3d_kernel( real *source, real *solution, real *dsdx, real *J, real *dSplit, int nq, int N, int nvar) { uint32_t iq = threadIdx.x; if (iq < (uint32_t)nq) { uint32_t iel = blockIdx.x; uint32_t nel = gridDim.x; uint32_t ii = iq % (N + 1); uint32_t jj = (iq / (N + 1)) % (N + 1); uint32_t kk = iq / (N + 1) / (N + 1); real rho_ijk = solution[iq + nq * (iel + nel * 0)]; real phi_ijk = solution[iq + nq * (iel + nel * 5)]; real acc = (real)0; for (int nn = 0; nn < N + 1; nn++) { // xi^1 partner (nn, jj, kk) — uses Ja^1_z (slot d=2,r=0 -> idx 2) uint32_t iq1 = nn + (N + 1) * (jj + (N + 1) * kk); real rho_p = solution[iq1 + nq * (iel + nel * 0)]; real phi_p = solution[iq1 + nq * (iel + nel * 5)]; real rho_log = log_mean_dev(rho_ijk, rho_p); real Ja_z_avg = (real)0.5 * (dsdx[iq + nq * (iel + nel * (2 + 3 * 0))] + dsdx[iq1 + nq * (iel + nel * (2 + 3 * 0))]); real D_r = dSplit[nn + (N + 1) * ii]; acc += D_r * Ja_z_avg * rho_log * (phi_p - phi_ijk); // xi^2 partner (ii, nn, kk) — Ja^2_z slot d=2,r=1 -> idx 5 uint32_t iq2 = ii + (N + 1) * (nn + (N + 1) * kk); rho_p = solution[iq2 + nq * (iel + nel * 0)]; phi_p = solution[iq2 + nq * (iel + nel * 5)]; rho_log = log_mean_dev(rho_ijk, rho_p); Ja_z_avg = (real)0.5 * (dsdx[iq + nq * (iel + nel * (2 + 3 * 1))] + dsdx[iq2 + nq * (iel + nel * (2 + 3 * 1))]); D_r = dSplit[nn + (N + 1) * jj]; acc += D_r * Ja_z_avg * rho_log * (phi_p - phi_ijk); // xi^3 partner (ii, jj, nn) — Ja^3_z slot d=2,r=2 -> idx 8 uint32_t iq3 = ii + (N + 1) * (jj + (N + 1) * nn); rho_p = solution[iq3 + nq * (iel + nel * 0)]; phi_p = solution[iq3 + nq * (iel + nel * 5)]; rho_log = log_mean_dev(rho_ijk, rho_p); Ja_z_avg = (real)0.5 * (dsdx[iq + nq * (iel + nel * (2 + 3 * 2))] + dsdx[iq3 + nq * (iel + nel * (2 + 3 * 2))]); D_r = dSplit[nn + (N + 1) * kk]; acc += D_r * Ja_z_avg * rho_log * (phi_p - phi_ijk); } real jac = J[iq + nq * iel]; source[iq + nq * (iel + nel * 0)] = (real)0; // rho source[iq + nq * (iel + nel * 1)] = (real)0; // rhou source[iq + nq * (iel + nel * 2)] = (real)0; // rhov source[iq + nq * (iel + nel * 3)] = -acc / jac; // rhow source[iq + nq * (iel + nel * 4)] = (real)0; // rhotheta source[iq + nq * (iel + nel * 5)] = (real)0; // Phi } } extern "C" { void sourcemethod_esatmo3d_gpu( real *source, real *solution, real *dsdx, real *J, real *dSplit, int N, int nvar, int nel) { int nq = (N + 1) * (N + 1) * (N + 1); if (N < 4) { sourcemethod_esatmo3d_kernel<64><<<dim3(nel, 1, 1), dim3(64, 1, 1), 0, 0>>>( source, solution, dsdx, J, dSplit, nq, N, nvar); } else { sourcemethod_esatmo3d_kernel<512><<<dim3(nel, 1, 1), dim3(512, 1, 1), 0, 0>>>( source, solution, dsdx, J, dSplit, nq, N, nvar); } } } // ============================================================ // Diffusive flux for ESAtmo3D — constant-coefficient Laplacian // (Bassi-Rebay 1) with separate momentum (nu) and thermal (kappa) // diffusivities. // // Interior fill: // F_d(rho) = 0 // F_d(rho*v_i) = -nu * d(rho*v_i)/dx_d for i = 1,2,3 // F_d(rho*theta) = -kappa * d(rho*theta)/dx_d // // solutionGradient layout (Vector3D): // dsdx[(idof + ndof_node*ivar) + ndof_per_dir*d] // where ndof_node = (N+1)^3 * nel and ndof_per_dir = ndof_node*nvar. // // diffFlux is also a Vector3D with the same layout. Using the // MappedDGDivergence pipeline downstream gives the BR1 weak-form // diffusive divergence in a single MappedDGDivergence call. // ============================================================ __global__ void diffusiveflux_esatmo3d_kernel( real *diffFlux, real *grad, real nu, real kappa, uint32_t ndof_node, uint32_t nvar) { uint32_t i = threadIdx.x + blockIdx.x*blockDim.x; uint32_t ndof_total = ndof_node * nvar * 3; if (i < ndof_total) { // Decode (node, ivar, d). Layout: outer d, middle ivar, inner node. uint32_t node = i % ndof_node; uint32_t var = (i / ndof_node) % nvar; // Choose coefficient by variable index: 0 -> rho (no diffusion); // 1,2,3 -> momentum (nu); 4 -> rho*theta (kappa); 5 -> Phi (no diffusion). real coeff; if (var == 0 || var == 5) coeff = (real)0; else if (var == 4) coeff = kappa; else coeff = nu; diffFlux[i] = -coeff * grad[i]; (void)node; } } extern "C" { void diffusiveflux_esatmo3d_gpu( real *diffFlux, real *grad, real nu, real kappa, int N, int nvar, int nel) { uint32_t ndof_node = (N+1) * (N+1) * (N+1) * (uint32_t)nel; uint32_t ndof_total = ndof_node * (uint32_t)nvar * 3; uint32_t nthreads = 256; uint32_t nblocks = (ndof_total + nthreads - 1) / nthreads; diffusiveflux_esatmo3d_kernel<<<dim3(nblocks,1,1), dim3(nthreads,1,1), 0, 0>>>( diffFlux, grad, nu, kappa, ndof_node, (uint32_t)nvar); } } // ============================================================ // Boundary BR1 central flux for the parabolic terms: // f_R^diff(iVar) = -coeff(iVar) * (avg_grad(iVar,d) . n_d) * nmag // // avgGrad layout (Vector3D, boundary side, with the 3 components // stored as separate "vars" of the 3*nvar ordering used internally): // avgGrad[idof_face + ndof_face*(var + nvar*d)] // where idof_face = (i + (N+1)*j + (N+1)^2*side + 6*(N+1)^2*iel). // nhat layout (Vector3D, nVar=1, boundary): // nhat[idof_face] : x-comp // nhat[idof_face + ndof_face] : y-comp // nhat[idof_face + 2*ndof_face] : z-comp // nscale layout (Scalar3D, nVar=1): nscale[idof_face] // fluxN layout (Scalar3D, boundary): fluxN[idof_face + ndof_face*var] // ============================================================ __global__ void diffusiveboundaryflux_esatmo3d_kernel( real *fluxN, real *avgGrad, real *uBnd, real *uExt, real *nhat, real *nscale, real nu, real kappa, real tau_nu, real tau_kappa, uint32_t ndof_face, uint32_t nvar) { // SIPG-stabilised BR1 boundary flux: // fluxN = -coeff*(avgGrad . n)*nmag + tau*(uL - uR)*nmag // tau_nu, tau_kappa precomputed on host: // tau = eta_penalty * coeff * (N+1)^2 / length_scale. // Solution buffer layout (Scalar3D, boundary): // uBnd[idof + ndof_face*var] = solution%boundary(... ,var) // uExt[idof + ndof_face*var] = solution%extBoundary(... ,var) uint32_t idof = threadIdx.x + blockIdx.x*blockDim.x; if (idof < ndof_face) { real nx = nhat[idof]; real ny = nhat[idof + ndof_face]; real nz = nhat[idof + 2*ndof_face]; real nmag = nscale[idof]; for (uint32_t var = 0; var < nvar; var++) { real coeff, tau; if (var == 0 || var == 5) { coeff = (real)0; tau = (real)0; } else if (var == 4) { coeff = kappa; tau = tau_kappa; } else { coeff = nu; tau = tau_nu; } // avgGrad components for this variable: stride between d-slabs // is ndof_face*nvar. real gx = avgGrad[idof + ndof_face*(var + nvar*0)]; real gy = avgGrad[idof + ndof_face*(var + nvar*1)]; real gz = avgGrad[idof + ndof_face*(var + nvar*2)]; real gn = gx*nx + gy*ny + gz*nz; real uL = uBnd[idof + ndof_face*var]; real uR = uExt[idof + ndof_face*var]; fluxN[idof + ndof_face*var] = (-coeff*gn + tau*(uL - uR)) * nmag; } } } extern "C" { void diffusiveboundaryflux_esatmo3d_gpu( real *fluxN, real *avgGrad, real *uBnd, real *uExt, real *nhat, real *nscale, real nu, real kappa, real tau_nu, real tau_kappa, int N, int nvar, int nel) { uint32_t ndof_face = (N+1) * (N+1) * 6 * (uint32_t)nel; uint32_t nthreads = 256; uint32_t nblocks = (ndof_face + nthreads - 1) / nthreads; diffusiveboundaryflux_esatmo3d_kernel<<<dim3(nblocks,1,1), dim3(nthreads,1,1), 0, 0>>>( fluxN, avgGrad, uBnd, uExt, nhat, nscale, nu, kappa, tau_nu, tau_kappa, ndof_face, (uint32_t)nvar); } }