// SELF_TwoPointVectors.cpp Source File
//
// Contents
//
// Source Code
#include <cstdint>
#include <cstdio>

#include "SELF_GPU_Macros.h"

// ============================================================
// TwoPointVectorDivergence_2D
//
// Computes the split-form (two-point) divergence in 2-D on the
// reference element:
//
//   df(i,j,iel,ivar) = 2 * sum_{n=0}^{N} [ D_{n,i} * F^0(n,i,j,iel,ivar)
//                                         + D_{n,j} * F^1(n,i,j,iel,ivar) ]
//
// Input  f  : two-point vector, layout TPV_2D_INDEX(n,i,j,iel,ivar,idir,...)
// Output df : scalar, layout SC_2D_INDEX / iq + nq*(iel + nel*ivar)
// ============================================================

template <int blockSize>
__global__ void __launch_bounds__(256) TwoPointVectorDivergence_2D_gpukernel(
    real *f, real *df, real *dmatrix, int nq, int N, int nvar)
{
  // One block per (element, variable): grid = (nel, nvar, 1).
  // One thread per quadrature node; requires blockSize >= nq = (N+1)^2.
  uint32_t iq = threadIdx.x;

  // Stage the derivative matrix (nq entries; blockSize may exceed nq) in
  // shared memory. The load is guarded per-thread, but the barrier is kept
  // OUTSIDE the guard: __syncthreads() must be reached by every thread in
  // the block, and the previous placement inside `if (iq < nq)` is
  // undefined behaviour whenever blockSize > nq (excess threads skip it).
  __shared__ real dmloc[blockSize];
  if (iq < nq) {
    dmloc[iq] = dmatrix[iq];
  }
  __syncthreads();

  if (iq < nq) {
    uint32_t iel  = blockIdx.x;
    uint32_t nel  = gridDim.x;
    uint32_t ivar = blockIdx.y;
    uint32_t i    = iq % (N+1);
    uint32_t j    = iq / (N+1);

    // nq3 = (N+1)^3 : stride between idir slices in the two-point array
    int nq3 = nq*(N+1);

    real dfLoc = 0.0;
    for (int nn = 0; nn < N+1; nn++) {
      // f^0(n,i,j,...) at idir=0  :  TPV_2D_INDEX reduces to
      //   nn + (N+1)*iq + nq3*(iel + nel*ivar)
      real f0 = f[nn + (N+1)*iq + nq3*(iel + nel*ivar)];
      // f^1(n,i,j,...) at idir=1  :  nq3*(iel + nel*(ivar + nvar))
      real f1 = f[nn + (N+1)*iq + nq3*(iel + nel*(ivar + nvar))];

      dfLoc += dmloc[nn + (N+1)*i]*f0 + dmloc[nn + (N+1)*j]*f1;
    }
    // Split-form divergence: df = 2 * sum_n (D_{n,i} F^0 + D_{n,j} F^1)
    df[iq + nq*(iel + nel*ivar)] = 2.0*dfLoc;
  }
}

extern "C"
{
  // Host launcher: picks the smallest templated block size that covers the
  // nq = (N+1)^2 quadrature nodes per element.
  void TwoPointVectorDivergence_2D_gpu(real *f, real *df, real *dmatrix, int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1);
    if (nq <= 64) { // N <= 7
      TwoPointVectorDivergence_2D_gpukernel<64><<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(
        f, df, dmatrix, nq, N, nvar);
    } else if (nq <= 256) { // N <= 15
      TwoPointVectorDivergence_2D_gpukernel<256><<<dim3(nel,nvar,1), dim3(256,1,1), 0, 0>>>(
        f, df, dmatrix, nq, N, nvar);
    } else {
      // Previously N > 15 still launched 256 threads with nq > 256, silently
      // leaving nodes with iq >= 256 uncomputed. Fail loudly instead.
      fprintf(stderr, "TwoPointVectorDivergence_2D_gpu: unsupported N = %d (max 15)\n", N);
    }
  }
}

// ============================================================
// TwoPointVectorDivergence_3D
//
// Computes the split-form divergence in 3-D on the reference element:
//
//   df(i,j,k,iel,ivar) = 2 * sum_n [ D_{n,i} F^0(n,i,j,k)
//                                   + D_{n,j} F^1(n,i,j,k)
//                                   + D_{n,k} F^2(n,i,j,k) ]
// ============================================================

template <int blockSize, int matSize>
__global__ void __launch_bounds__(512) TwoPointVectorDivergence_3D_gpukernel(
    real *f, real *df, real *dmatrix, int nq, int N, int nvar)
{
  // One block per (element, variable): grid = (nel, nvar, 1).
  // One thread per quadrature node; requires blockSize >= nq = (N+1)^3 and
  // matSize >= (N+1)^2.
  uint32_t iq = threadIdx.x;
  uint32_t i  = iq % (N+1);
  uint32_t j  = (iq/(N+1)) % (N+1);
  uint32_t k  = iq/(N+1)/(N+1);

  // Load D-matrix (matSize = (N+1)^2 entries) into shared memory; only the
  // valid threads in the k==0 plane contribute. The barrier is kept OUTSIDE
  // every divergent branch: the previous placement inside `if (iq < nq)` is
  // undefined behaviour whenever blockSize > nq (e.g. N = 2, nq = 27,
  // blockSize = 64), since the excess threads would skip __syncthreads().
  __shared__ real dmloc[matSize];
  if (iq < nq && k == 0) {
    dmloc[i + (N+1)*j] = dmatrix[i + (N+1)*j];
  }
  __syncthreads();

  if (iq < nq) {
    uint32_t iel  = blockIdx.x;
    uint32_t nel  = gridDim.x;
    uint32_t ivar = blockIdx.y;

    // nq4 = (N+1)^4 : stride between idir slices
    int nq4 = nq*(N+1);

    real dfLoc = 0.0;
    for (int nn = 0; nn < N+1; nn++) {
      real f0 = f[nn + (N+1)*iq + nq4*(iel + nel*ivar)];
      real f1 = f[nn + (N+1)*iq + nq4*(iel + nel*(ivar + nvar))];
      real f2 = f[nn + (N+1)*iq + nq4*(iel + nel*(ivar + 2*nvar))];

      dfLoc += dmloc[nn + (N+1)*i]*f0
             + dmloc[nn + (N+1)*j]*f1
             + dmloc[nn + (N+1)*k]*f2;
    }
    // Split-form divergence: df = 2 * sum_n (D_{n,i}F^0 + D_{n,j}F^1 + D_{n,k}F^2)
    df[iq + nq*(iel + nel*ivar)] = 2.0*dfLoc;
  }
}

extern "C"
{
  // Host launcher: block size covers nq = (N+1)^3 nodes, matSize covers the
  // (N+1)^2-entry derivative matrix.
  void TwoPointVectorDivergence_3D_gpu(real *f, real *df, real *dmatrix, int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1)*(N+1);
    if (N < 4) { // nq <= 64, matrix <= 16
      TwoPointVectorDivergence_3D_gpukernel<64,16><<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(
        f, df, dmatrix, nq, N, nvar);
    } else if (N < 8) { // nq <= 512, matrix <= 64
      TwoPointVectorDivergence_3D_gpukernel<512,64><<<dim3(nel,nvar,1), dim3(512,1,1), 0, 0>>>(
        f, df, dmatrix, nq, N, nvar);
    } else {
      // Previously a silent no-op for N >= 8; report the unsupported case.
      fprintf(stderr, "TwoPointVectorDivergence_3D_gpu: unsupported N = %d (max 7)\n", N);
    }
  }
}

// ============================================================
// MappedTwoPointVectorDivergence_2D
//
// Fused kernel for the physical-space split-form divergence on a
// curvilinear 2-D mesh following Winters, Kopriva, Gassner & Hindenlang.
//
// For each node (i,j) the contravariant two-point fluxes are formed
// by projecting the physical-space two-point fluxes onto *averaged*
// metric terms:
//
//   F~^r_{(i,n),j} = sum_d (Ja^r_d(i,j) + Ja^r_d(n,j))/2 * f^d(n,i,j)
//
// where dsdx[iq + nq*(iel + nel*(d + 2*r))] = J*a^r_d  (0-based d,r).
//
// The physical divergence is:
//   df = (2/J) * sum_n [ D_{n,i} F~^1 + D_{n,j} F~^2 ]
// ============================================================

template <int blockSize>
__global__ void __launch_bounds__(256) MappedTwoPointVectorDivergence_2D_gpukernel(
    real *f, real *df, real *dmatrix, real *dsdx, real *jacobian, int nq, int N, int nvar)
{
  // One block per (element, variable); one thread per quadrature node.
  // Requires blockSize >= nq = (N+1)^2.
  uint32_t iq = threadIdx.x;

  // Stage the derivative matrix (nq entries) in shared memory. The load is
  // guarded, but __syncthreads() sits OUTSIDE the guard: the previous
  // placement inside `if (iq < nq)` is undefined behaviour whenever
  // blockSize > nq, because the excess threads would skip the barrier.
  __shared__ real dmloc[blockSize];
  if (iq < nq) {
    dmloc[iq] = dmatrix[iq];
  }
  __syncthreads();

  if (iq < nq) {
    uint32_t iel  = blockIdx.x;
    uint32_t nel  = gridDim.x;
    uint32_t ivar = blockIdx.y;
    uint32_t i    = iq % (N+1);
    uint32_t j    = iq / (N+1);

    int nq3 = nq*(N+1);

    // Pre-load the four metric components at the current node (i,j) to
    // avoid repeated global-memory reads inside the nn loop.
    // dsdx[iq + nq*(iel + nel*(d + 2*r))] = J*a^r_d (0-based d,r).
    real m00 = dsdx[iq + nq*(iel + nel*0)]; // J*a^1_x  at (i,j)
    real m10 = dsdx[iq + nq*(iel + nel*1)]; // J*a^1_y  at (i,j)
    real m01 = dsdx[iq + nq*(iel + nel*2)]; // J*a^2_x  at (i,j)
    real m11 = dsdx[iq + nq*(iel + nel*3)]; // J*a^2_y  at (i,j)

    real dfLoc = 0.0;
    for (int nn = 0; nn < N+1; nn++) {
      // Node (nn,j) for the xi^1 averaged metric
      uint32_t iq_nn_j = nn + (N+1)*j;
      // Node (i,nn) for the xi^2 averaged metric
      uint32_t iq_i_nn = i + (N+1)*nn;

      // Physical-space two-point fluxes (same stored value is used for both
      // directional sums; assumes the TPV layout supplies the appropriate
      // pairing -- confirm against the two-point flux assembly).
      real fx = f[nn + (N+1)*iq + nq3*(iel + nel*ivar)];
      real fy = f[nn + (N+1)*iq + nq3*(iel + nel*(ivar + nvar))];

      // Contravariant two-point flux in xi^1: metric averaged between (i,j)-(nn,j)
      real Fc1 = 0.5*(m00 + dsdx[iq_nn_j + nq*(iel + nel*0)])*fx
               + 0.5*(m10 + dsdx[iq_nn_j + nq*(iel + nel*1)])*fy;

      // Contravariant two-point flux in xi^2: metric averaged between (i,j)-(i,nn)
      real Fc2 = 0.5*(m01 + dsdx[iq_i_nn + nq*(iel + nel*2)])*fx
               + 0.5*(m11 + dsdx[iq_i_nn + nq*(iel + nel*3)])*fy;

      dfLoc += dmloc[nn + (N+1)*i]*Fc1 + dmloc[nn + (N+1)*j]*Fc2;
    }
    // Physical divergence: df = (2/J) * sum_n (D_{n,i} F~^1 + D_{n,j} F~^2)
    df[iq + nq*(iel + nel*ivar)] = 2.0*dfLoc/jacobian[iq + nq*iel];
  }
}

extern "C"
{
  // Host launcher: picks the smallest templated block size covering
  // nq = (N+1)^2 quadrature nodes per element.
  void MappedTwoPointVectorDivergence_2D_gpu(
      real *f, real *df, real *dmatrix, real *dsdx, real *jacobian, int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1);
    if (nq <= 64) { // N <= 7
      MappedTwoPointVectorDivergence_2D_gpukernel<64><<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(
        f, df, dmatrix, dsdx, jacobian, nq, N, nvar);
    } else if (nq <= 256) { // N <= 15
      MappedTwoPointVectorDivergence_2D_gpukernel<256><<<dim3(nel,nvar,1), dim3(256,1,1), 0, 0>>>(
        f, df, dmatrix, dsdx, jacobian, nq, N, nvar);
    } else {
      // Previously N > 15 launched 256 threads with nq > 256, silently
      // leaving nodes with iq >= 256 uncomputed. Fail loudly instead.
      fprintf(stderr, "MappedTwoPointVectorDivergence_2D_gpu: unsupported N = %d (max 15)\n", N);
    }
  }
}

// ============================================================
// MappedTwoPointVectorDivergence_3D
//
// Fused kernel for the physical-space split-form divergence on a
// curvilinear 3-D mesh.
//
// dsdx[iq + nq*(iel + nel*(d + 3*r))] = J*a^r_d  (0-based d in {0,1,2},
//                                                   r in {0,1,2}).
//
// Three averaging pairs:
//   xi^1: metric averaged between (i,j,k)-(nn,j,k)
//   xi^2: metric averaged between (i,j,k)-(i,nn,k)
//   xi^3: metric averaged between (i,j,k)-(i,j,nn)
// ============================================================

template <int blockSize, int matSize>
__global__ void __launch_bounds__(512) MappedTwoPointVectorDivergence_3D_gpukernel(
    real *f, real *df, real *dmatrix, real *dsdx, real *jacobian, int nq, int N, int nvar)
{
  // One block per (element, variable); one thread per quadrature node.
  // Requires blockSize >= nq = (N+1)^3 and matSize >= (N+1)^2.
  uint32_t iq = threadIdx.x;
  uint32_t i  = iq % (N+1);
  uint32_t j  = (iq/(N+1)) % (N+1);
  uint32_t k  = iq/(N+1)/(N+1);

  // Load D-matrix (matSize = (N+1)^2 entries): only the valid k==0 plane
  // contributes. The barrier is kept OUTSIDE all divergent branches -- the
  // previous placement inside `if (iq < nq)` is undefined behaviour when
  // blockSize > nq, since the excess threads would skip __syncthreads().
  __shared__ real dmloc[matSize];
  if (iq < nq && k == 0) {
    dmloc[i + (N+1)*j] = dmatrix[i + (N+1)*j];
  }
  __syncthreads();

  if (iq < nq) {
    uint32_t iel  = blockIdx.x;
    uint32_t nel  = gridDim.x;
    uint32_t ivar = blockIdx.y;

    int nq4 = nq*(N+1);

    // Pre-load the nine metric components at the current node (i,j,k).
    // dsdx[iq + nq*(iel + nel*(d + 3*r))] = J*a^r_d (0-based d,r).
    real m00 = dsdx[iq + nq*(iel + nel*0)]; // J*a^1_x
    real m10 = dsdx[iq + nq*(iel + nel*1)]; // J*a^1_y
    real m20 = dsdx[iq + nq*(iel + nel*2)]; // J*a^1_z
    real m01 = dsdx[iq + nq*(iel + nel*3)]; // J*a^2_x
    real m11 = dsdx[iq + nq*(iel + nel*4)]; // J*a^2_y
    real m21 = dsdx[iq + nq*(iel + nel*5)]; // J*a^2_z
    real m02 = dsdx[iq + nq*(iel + nel*6)]; // J*a^3_x
    real m12 = dsdx[iq + nq*(iel + nel*7)]; // J*a^3_y
    real m22 = dsdx[iq + nq*(iel + nel*8)]; // J*a^3_z

    real dfLoc = 0.0;
    for (int nn = 0; nn < N+1; nn++) {
      // Neighbour nodes for each coordinate direction's averaging pair
      uint32_t iq_nn_jk = nn + (N+1)*(j + (N+1)*k); // (nn,j,k) for xi^1
      uint32_t iq_i_nn_k = i + (N+1)*(nn + (N+1)*k); // (i,nn,k) for xi^2
      uint32_t iq_ij_nn  = i + (N+1)*(j + (N+1)*nn); // (i,j,nn) for xi^3

      // Physical two-point fluxes (same stored values feed all three
      // directional sums; assumes the TPV layout supplies the appropriate
      // pairing -- confirm against the two-point flux assembly).
      real fx = f[nn + (N+1)*iq + nq4*(iel + nel*ivar)];
      real fy = f[nn + (N+1)*iq + nq4*(iel + nel*(ivar + nvar))];
      real fz = f[nn + (N+1)*iq + nq4*(iel + nel*(ivar + 2*nvar))];

      // Contravariant two-point flux in xi^1: (i,j,k)-(nn,j,k)
      real Fc1 = 0.5*(m00 + dsdx[iq_nn_jk + nq*(iel + nel*0)])*fx
               + 0.5*(m10 + dsdx[iq_nn_jk + nq*(iel + nel*1)])*fy
               + 0.5*(m20 + dsdx[iq_nn_jk + nq*(iel + nel*2)])*fz;

      // Contravariant two-point flux in xi^2: (i,j,k)-(i,nn,k)
      real Fc2 = 0.5*(m01 + dsdx[iq_i_nn_k + nq*(iel + nel*3)])*fx
               + 0.5*(m11 + dsdx[iq_i_nn_k + nq*(iel + nel*4)])*fy
               + 0.5*(m21 + dsdx[iq_i_nn_k + nq*(iel + nel*5)])*fz;

      // Contravariant two-point flux in xi^3: (i,j,k)-(i,j,nn)
      real Fc3 = 0.5*(m02 + dsdx[iq_ij_nn + nq*(iel + nel*6)])*fx
               + 0.5*(m12 + dsdx[iq_ij_nn + nq*(iel + nel*7)])*fy
               + 0.5*(m22 + dsdx[iq_ij_nn + nq*(iel + nel*8)])*fz;

      dfLoc += dmloc[nn + (N+1)*i]*Fc1
             + dmloc[nn + (N+1)*j]*Fc2
             + dmloc[nn + (N+1)*k]*Fc3;
    }
    // Physical divergence: df = (2/J) * sum_n (D F~^1 + D F~^2 + D F~^3)
    df[iq + nq*(iel + nel*ivar)] = 2.0*dfLoc/jacobian[iq + nq*iel];
  }
}

extern "C"
{
  // Host launcher: block size covers nq = (N+1)^3 nodes, matSize covers the
  // (N+1)^2-entry derivative matrix.
  void MappedTwoPointVectorDivergence_3D_gpu(
      real *f, real *df, real *dmatrix, real *dsdx, real *jacobian, int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1)*(N+1);
    if (N < 4) { // nq <= 64, matrix <= 16
      MappedTwoPointVectorDivergence_3D_gpukernel<64,16><<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(
        f, df, dmatrix, dsdx, jacobian, nq, N, nvar);
    } else if (N < 8) { // nq <= 512, matrix <= 64
      MappedTwoPointVectorDivergence_3D_gpukernel<512,64><<<dim3(nel,nvar,1), dim3(512,1,1), 0, 0>>>(
        f, df, dmatrix, dsdx, jacobian, nq, N, nvar);
    } else {
      // Previously a silent no-op for N >= 8; report the unsupported case.
      fprintf(stderr, "MappedTwoPointVectorDivergence_3D_gpu: unsupported N = %d (max 7)\n", N);
    }
  }
}

// ============================================================
// ECDGSurfaceContribution_2D / _3D
//
// Adds the weak-form DG surface contribution divided by the element Jacobian
// to the flux divergence array.  This is the surface term in the EC-DGSEM
// tendency:
//
//   df += (1/J) * M^{-1} * B^T * f_bn
//
// where f_bn = flux%boundaryNormal (Riemann fluxes scaled by nScale).
// The layout of f_bn is SCB_2D_INDEX / SCB_3D_INDEX (scalar boundary).
// The Jacobian array has layout SC_2D_INDEX / SC_3D_INDEX (scalar interior).
// ============================================================

// Adds the weak-form DG surface (lifting) contribution, divided by the
// element Jacobian, onto the interior divergence array df.
// One thread per quadrature node; grid = (nel, nvar, 1).
__global__ void __launch_bounds__(256) ECDGSurfaceContribution_2D_gpukernel(
    real *fbn, real *jacobian, real *bMatrix, real *qWeights, real *df, int N, int nq)
{
  uint32_t iq = threadIdx.x;
  if (iq < nq) {
    // Tensor-product node indices and block coordinates
    uint32_t i    = iq % (N+1);
    uint32_t j    = iq / (N+1);
    uint32_t iel  = blockIdx.x;
    uint32_t nel  = gridDim.x;
    uint32_t ivar = blockIdx.y;

    real jac = jacobian[iq + nq*iel];

    // Lift along xi^1: east (side 1) and west (side 3) boundary fluxes
    real xiLift =
        (bMatrix[i+(N+1)]*fbn[SCB_2D_INDEX(j,1,iel,ivar,N,nel)] +
         bMatrix[i]*fbn[SCB_2D_INDEX(j,3,iel,ivar,N,nel)]) / (qWeights[i]*jac);

    // Lift along xi^2: north (side 2) and south (side 0) boundary fluxes
    real etaLift =
        (bMatrix[j+(N+1)]*fbn[SCB_2D_INDEX(i,2,iel,ivar,N,nel)] +
         bMatrix[j]*fbn[SCB_2D_INDEX(i,0,iel,ivar,N,nel)]) / (qWeights[j]*jac);

    df[iq + nq*(iel + nel*ivar)] += xiLift + etaLift;
  }
}

extern "C"
{
  // Host launcher: picks a thread count covering nq = (N+1)^2 nodes.
  void ECDGSurfaceContribution_2D_gpu(
      real *fbn, real *jacobian, real *bMatrix, real *qWeights, real *df,
      int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1);
    if (nq <= 64) { // N <= 7
      ECDGSurfaceContribution_2D_gpukernel<<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(
        fbn, jacobian, bMatrix, qWeights, df, N, nq);
    } else if (nq <= 256) { // N <= 15
      ECDGSurfaceContribution_2D_gpukernel<<<dim3(nel,nvar,1), dim3(256,1,1), 0, 0>>>(
        fbn, jacobian, bMatrix, qWeights, df, N, nq);
    } else {
      // Previously N > 15 launched 256 threads with nq > 256, silently
      // leaving nodes with iq >= 256 uncomputed. Fail loudly instead.
      fprintf(stderr, "ECDGSurfaceContribution_2D_gpu: unsupported N = %d (max 15)\n", N);
    }
  }
}

// Adds the weak-form DG surface (lifting) contribution, divided by the
// element Jacobian, onto the interior divergence array df.
// One thread per quadrature node; grid = (nel, nvar, 1).
__global__ void __launch_bounds__(512) ECDGSurfaceContribution_3D_gpukernel(
    real *fbn, real *jacobian, real *bMatrix, real *qWeights, real *df, int N, int nq)
{
  uint32_t iq = threadIdx.x;
  if (iq < nq) {
    // Tensor-product node indices and block coordinates
    uint32_t i    = iq % (N+1);
    uint32_t j    = (iq/(N+1)) % (N+1);
    uint32_t k    = iq/(N+1)/(N+1);
    uint32_t iel  = blockIdx.x;
    uint32_t nel  = gridDim.x;
    uint32_t ivar = blockIdx.y;

    real jac = jacobian[iq + nq*iel];

    // Lift along xi^3: top (side 5) and bottom (side 0) boundary fluxes
    real zetaLift =
        (fbn[SCB_3D_INDEX(i,j,5,iel,ivar,N,nel)]*bMatrix[k+(N+1)] +
         fbn[SCB_3D_INDEX(i,j,0,iel,ivar,N,nel)]*bMatrix[k]) / (qWeights[k]*jac);

    // Lift along xi^1: east (side 2) and west (side 4) boundary fluxes
    real xiLift =
        (fbn[SCB_3D_INDEX(j,k,2,iel,ivar,N,nel)]*bMatrix[i+(N+1)] +
         fbn[SCB_3D_INDEX(j,k,4,iel,ivar,N,nel)]*bMatrix[i]) / (qWeights[i]*jac);

    // Lift along xi^2: north (side 3) and south (side 1) boundary fluxes
    real etaLift =
        (fbn[SCB_3D_INDEX(i,k,3,iel,ivar,N,nel)]*bMatrix[j+(N+1)] +
         fbn[SCB_3D_INDEX(i,k,1,iel,ivar,N,nel)]*bMatrix[j]) / (qWeights[j]*jac);

    df[iq + nq*(iel + nel*ivar)] += zetaLift + xiLift + etaLift;
  }
}

extern "C"
{
  // Host launcher: picks a thread count covering nq = (N+1)^3 nodes.
  void ECDGSurfaceContribution_3D_gpu(
      real *fbn, real *jacobian, real *bMatrix, real *qWeights, real *df,
      int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1)*(N+1);
    if (N < 4) { // nq <= 64
      ECDGSurfaceContribution_3D_gpukernel<<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(
        fbn, jacobian, bMatrix, qWeights, df, N, nq);
    } else if (N < 8) { // nq <= 512
      ECDGSurfaceContribution_3D_gpukernel<<<dim3(nel,nvar,1), dim3(512,1,1), 0, 0>>>(
        fbn, jacobian, bMatrix, qWeights, df, N, nq);
    } else {
      // Previously a silent no-op for N >= 8; report the unsupported case.
      fprintf(stderr, "ECDGSurfaceContribution_3D_gpu: unsupported N = %d (max 7)\n", N);
    }
  }
}