SELF_Data.cpp Source File


Contents

Source Code


Source Code

#include "SELF_GPU_Macros.h"

__global__ void Average(real *avgf, real *f1, real *f2, int ndof){

  uint32_t i = threadIdx.x + blockIdx.x*blockDim.x;

  if( i < ndof ){
    avgf[i] =0.5*(f1[i]+f2[i]);
  }
}

extern "C"
{
  void Average_gpu(real *f, real *f1, real *f2, int ndof)
  {
    int threads_per_block = 256;
    int nblocks_x = ndof/threads_per_block + 1;

    dim3 nblocks(nblocks_x,1,1);
    dim3 nthreads(threads_per_block,1,1);

    Average<<<nblocks, nthreads, 0, 0>>>(f, f1, f2, ndof);
  }
}

__global__ void BoundaryInterp_2D_gpukernel(real *bMatrix, real *f, real *fBound, int N, int nel, int nvar){
  int ndof = (N+1)*nel*nvar;
  int iq = threadIdx.x + blockIdx.x*blockDim.x;
  if(iq < ndof){
    int i = iq % (N+1);
    int iEl = (iq/(N+1)) % (nel);
    int iVar = iq/(N+1)/(nel);

    real fbl = 0.0;
    real fbr = 0.0;
    for (int ii=0; ii<N+1; ii++) {
      fbl += f[SC_2D_INDEX(i,ii,iEl,iVar,N,nel)]*bMatrix[ii]; // South
      fbr += f[SC_2D_INDEX(i,ii,iEl,iVar,N,nel)]*bMatrix[ii+(N+1)]; // North
    }
    fBound[SCB_2D_INDEX(i,0,iEl,iVar,N,nel)] = fbl; // South
    fBound[SCB_2D_INDEX(i,2,iEl,iVar,N,nel)] = fbr; // North


    fbl = 0.0;
    fbr = 0.0;
    for (int ii=0; ii<N+1; ii++) {
      fbl += f[SC_2D_INDEX(ii,i,iEl,iVar,N,nel)]*bMatrix[ii]; // West
      fbr += f[SC_2D_INDEX(ii,i,iEl,iVar,N,nel)]*bMatrix[ii+(N+1)]; // East
    }
    fBound[SCB_2D_INDEX(i,3,iEl,iVar,N,nel)] = fbl; // West
    fBound[SCB_2D_INDEX(i,1,iEl,iVar,N,nel)] = fbr; // East
  }

}

extern "C"
{
  void BoundaryInterp_2D_gpu(real *bMatrix, real *f, real *fBound, int N, int nvar, int nel)
  {
    int ndof = (N+1)*nel*nvar;
    int threads_per_block = 256;
    int nblocks_x = ndof/threads_per_block +1;
	  BoundaryInterp_2D_gpukernel<<<dim3(nblocks_x,1,1), dim3(threads_per_block,1,1), 0, 0>>>(bMatrix, f, fBound, N, nel, nvar);
  } 
}

__global__ void BoundaryInterp_3D_gpukernel(real *bMatrix, real *f, real *fBound, int N, int nel, int nvar){
  int ndof = (N+1)*(N+1)*nel*nvar;
  int iq = threadIdx.x + blockIdx.x*blockDim.x;
  if( iq < ndof ){
    int i = iq % (N+1);
    int j = (iq/(N+1))%(N+1);
    int iEl = (iq/(N+1)/(N+1)) % (nel);
    int iVar = iq/(N+1)/(N+1)/(nel);

    real fb[6] = {0.0};
    for (int ii=0; ii<N+1; ii++) {
      fb[0] += f[SC_3D_INDEX(i,j,ii,iEl,iVar,N,nel)]*bMatrix[ii]; // Bottom
      fb[1] += f[SC_3D_INDEX(i,ii,j,iEl,iVar,N,nel)]*bMatrix[ii]; // South
      fb[2] += f[SC_3D_INDEX(ii,i,j,iEl,iVar,N,nel)]*bMatrix[ii+(N+1)]; // East
      fb[3] += f[SC_3D_INDEX(i,ii,j,iEl,iVar,N,nel)]*bMatrix[ii+(N+1)]; // North
      fb[4] += f[SC_3D_INDEX(ii,i,j,iEl,iVar,N,nel)]*bMatrix[ii]; // West
      fb[5] += f[SC_3D_INDEX(i,j,ii,iEl,iVar,N,nel)]*bMatrix[ii+(N+1)]; // Top
    }
    fBound[SCB_3D_INDEX(i,j,0,iEl,iVar,N,nel)] = fb[0];
    fBound[SCB_3D_INDEX(i,j,1,iEl,iVar,N,nel)] = fb[1];
    fBound[SCB_3D_INDEX(i,j,2,iEl,iVar,N,nel)] = fb[2];
    fBound[SCB_3D_INDEX(i,j,3,iEl,iVar,N,nel)] = fb[3];
    fBound[SCB_3D_INDEX(i,j,4,iEl,iVar,N,nel)] = fb[4];
    fBound[SCB_3D_INDEX(i,j,5,iEl,iVar,N,nel)] = fb[5];
  }
}

extern "C"
{
  void BoundaryInterp_3D_gpu(real *bMatrix, real *f, real *fBound, int N, int nvar, int nel)
  {
    int ndof = (N+1)*(N+1)*nel*nvar;
    int threads_per_block = 256;
    int nblocks_x = ndof/threads_per_block +1;
	  BoundaryInterp_3D_gpukernel<<<dim3(nblocks_x,1,1), dim3(threads_per_block,1,1), 0, 0>>>(bMatrix, f, fBound, N, nel, nvar);
  } 
}

template <int blockSize>
__global__ void __launch_bounds__(256) Divergence_2D_gpukernel(real *f, real *df, real *dmatrix, int nq, int N){

    uint32_t iq = threadIdx.x;
    if( iq < nq ){
        
        uint32_t iel = blockIdx.x;
        uint32_t nel = gridDim.x;
        uint32_t ivar = blockIdx.y;
        uint32_t nvar = gridDim.y;
        uint32_t i = iq % (N+1);
        uint32_t j = (iq/(N+1));

        __shared__ real f1[blockSize];
        __shared__ real f2[blockSize];
        __shared__ real dmloc[blockSize];
        f1[iq] = f[iq + nq*(iel + nel*(ivar))]; // x-component
        f2[iq] = f[iq + nq*(iel + nel*(ivar + nvar))]; // y-component
        dmloc[iq] = dmatrix[iq];
        __syncthreads();

        real dfloc = 0.0;
        for(int ii = 0; ii<N+1; ii++){
            dfloc += dmloc[ii+(N+1)*i]*f1[ii+(N+1)*(j)]+
                     dmloc[ii+(N+1)*j]*f2[i+(N+1)*(ii)];
        }
        df[iq + nq*(iel + nel*ivar)] = dfloc;
    }

}

extern "C"
{
  void Divergence_2D_gpu(real *f, real *df, real *dmatrix, int N, int nvar, int nel){
    int nq = (N+1)*(N+1);

    if( N <= 7 ){
      Divergence_2D_gpukernel<64><<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(f,df,dmatrix,nq,N);
    } else {
      Divergence_2D_gpukernel<256><<<dim3(nel,nvar,1), dim3(256,1,1), 0, 0>>>(f,df,dmatrix,nq,N);
    }

  }
}

__global__ void __launch_bounds__(256) DG_BoundaryContribution_2D_gpukernel(real *bMatrix, real *qWeights, real *bf, real *df, int N, int nq){

  uint32_t iq = threadIdx.x;

  if( iq < nq ){
    uint32_t i = iq % (N+1);
    uint32_t j = (iq/(N+1));
    uint32_t iel = blockIdx.x;
    uint32_t nel = gridDim.x;
    uint32_t ivar = blockIdx.y;

    df[iq + nq*(iel + nel*ivar)] += (bMatrix[i+(N+1)]*bf[SCB_2D_INDEX(j,1,iel,ivar,N,nel)] + // east
                                           bMatrix[i]*bf[SCB_2D_INDEX(j,3,iel,ivar,N,nel)])/ // west
                                         qWeights[i];

    df[iq + nq*(iel + nel*ivar)] += (bMatrix[j+(N+1)]*bf[SCB_2D_INDEX(i,2,iel,ivar,N,nel)] + // north
                                           bMatrix[j]*bf[SCB_2D_INDEX(i,0,iel,ivar,N,nel)])/  // south
                                         qWeights[j];
  }

}

extern "C"
{
  void DG_BoundaryContribution_2D_gpu(real *bMatrix, real *qWeights, real *bf, real *df, int N, int nvar, int nel)
  {
    int nq = (N+1)*(N+1);

    if( N <= 7 ){
      DG_BoundaryContribution_2D_gpukernel<<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(bMatrix, qWeights, bf, df, N, nq);
    } else {
      DG_BoundaryContribution_2D_gpukernel<<<dim3(nel,nvar,1), dim3(256,1,1), 0, 0>>>(bMatrix, qWeights, bf, df, N, nq);
    }
  } 
}

template<int blockSize, int matSize>
__global__ void __launch_bounds__(512) Divergence_3D_gpukernel(double *f, double *df, double *dmatrix, int nq, int N, int nel, int nvar){

    uint32_t idof = threadIdx.x;
    if( idof < nq ){
        
        uint32_t iel = blockIdx.x;
        uint32_t ivar = blockIdx.y;
        uint32_t i = idof % (N+1);
        uint32_t j = (idof/(N+1)) % (N+1);
        uint32_t k = (idof/(N+1)/(N+1));

        __shared__ double f1[blockSize];
        __shared__ double f2[blockSize];
        __shared__ double f3[blockSize];
        __shared__ double dmloc[matSize];
        f1[i+(N+1)*(j+(N+1)*k)] = f[i+(N+1)*(j+(N+1)*(k + (N+1)*(iel + nel*(ivar))))];
        f2[i+(N+1)*(j+(N+1)*k)] = f[i+(N+1)*(j+(N+1)*(k + (N+1)*(iel + nel*(ivar + nvar))))];
        f3[i+(N+1)*(j+(N+1)*k)] = f[i+(N+1)*(j+(N+1)*(k + (N+1)*(iel + nel*(ivar + 2*nvar))))];
        if( k == 0 ){
            dmloc[i+(N+1)*j] = dmatrix[i+(N+1)*j];
        }
        __syncthreads();

        double dfloc = 0.0;

        for(int ii = 0; ii<N+1; ii++){
            dfloc += dmloc[ii+(N+1)*i]*f1[ii+(N+1)*(j+(N+1)*(k))]+
                     dmloc[ii+(N+1)*j]*f2[i+(N+1)*(ii+(N+1)*(k))]+
                     dmloc[ii+(N+1)*k]*f3[i+(N+1)*(j+(N+1)*(ii))];
        }
        df[idof + nq*(iel + nel*ivar)] = dfloc;
    }

}

extern "C"
{
  void Divergence_3D_gpu(double *f, double *df, double *dmatrix, int N, int nel, int nvar){
    int nq = (N+1)*(N+1)*(N+1);
    if( N < 4 ){
        Divergence_3D_gpukernel<64,16><<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(f,df,dmatrix,nq,N,nel,nvar);

    } else if( N >= 4 && N < 8 ){
        Divergence_3D_gpukernel<512,64><<<dim3(nel,nvar,1), dim3(512,1,1), 0, 0>>>(f,df,dmatrix,nq,N,nel,nvar);
    }
  }
}

__global__ void __launch_bounds__(512) DG_BoundaryContribution_3D_gpukernel(real *bMatrix, real *qWeights, real *bf, real *df, int N, int nel){

  uint32_t iq = threadIdx.x;
  uint32_t nq = (N+1)*(N+1)*(N+1);

  if( iq < nq ){
    uint32_t i = iq % (N+1);
    uint32_t j = (iq/(N+1))%(N+1);
    uint32_t k = (iq/(N+1)/(N+1))%(N+1);
    uint32_t iel = blockIdx.x;
    uint32_t ivar = blockIdx.y;
    df[iq + nq*(iel + nel*ivar)] += (bf[SCB_3D_INDEX(i,j,5,iel,ivar,N,nel)]*bMatrix[k+(N+1)] + // top
                                              bf[SCB_3D_INDEX(i,j,0,iel,ivar,N,nel)]*bMatrix[k])/       // bottom
                                              qWeights[k];

    df[iq + nq*(iel + nel*ivar)] += (bf[SCB_3D_INDEX(j,k,2,iel,ivar,N,nel)]*bMatrix[i+(N+1)] + // east
                                              bf[SCB_3D_INDEX(j,k,4,iel,ivar,N,nel)]*bMatrix[i])/       // west
                                              qWeights[i];

    df[iq + nq*(iel + nel*ivar)] += (bf[SCB_3D_INDEX(i,k,3,iel,ivar,N,nel)]*bMatrix[j+(N+1)] + // north
                                              bf[SCB_3D_INDEX(i,k,1,iel,ivar,N,nel)]*bMatrix[j])/       // south
                                              qWeights[j];
  }

}

extern "C"
{
  void DG_BoundaryContribution_3D_gpu(real *bMatrix, real *qWeights, real *bf, real *df, int N, int nvar, int nel)
  {

    int nq = (N+1)*(N+1)*(N+1);
    if( N < 4 ){
        DG_BoundaryContribution_3D_gpukernel<<<dim3(nel,nvar,1), dim3(64,1,1), 0, 0>>>(bMatrix, qWeights, bf, df, N, nel);

    } else if( N >= 4 && N < 8 ){
        DG_BoundaryContribution_3D_gpukernel<<<dim3(nel,nvar,1), dim3(512,1,1), 0, 0>>>(bMatrix, qWeights, bf, df, N, nel);
    }
  } 
}