首頁猿問如何使用嵌套for循環添加兩個2d...

如何使用嵌套for循環將兩個2d（pitched，帶行間距）數組相加？

源碼算法與數據結構

躍然一笑 2019-07-30 16:50:24

如何使用嵌套for循環將兩個2d（pitched，帶行間距）數組相加？我是cuda的新手。我想將兩個2d數組相加並存入第三個數組。我使用以下代碼：cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);現在我的問題是，我不想在內核代碼中把這些數組當作扁平化的二維數組來使用，我希望的是使用兩個for循環並將結果放在第三個數組中，像這樣：__global__ void add(int *dev_a ,int *dev_b,int* dec_c){ for i=0;i<2;i++) { for j=0;j<2;j++) { dev_c[i][j]=dev_a[i][j]+dev_b[i][j]; } }}我怎麼能在CUDA中做到這一點？請告訴我如何以這種方式使用二維數組？使用2d數組時的內核調用應該是什麼樣的？如果可能，請使用代碼示例進行說明。

查看完整描述

2 回答

RISEBY

TA貢獻1856條經驗 獲得超5個贊

簡短的回答是，你做不到。cudaMallocPitch()函數正如其名稱所暗示的那樣，分配的是帶間距（pitched）的線性內存，其中間距（pitch）的取值是針對GPU內存控制器和紋理硬件選出的最優值。

如果你想在內核中使用指針數組，那麼內核代碼必須如下所示：

// Element-wise sum of two 2x2 int matrices, each passed as an array of row
// pointers so the body can use dev_x[i][j] indexing.
//
// dev_a, dev_b: inputs; the kernel reads dev_a[i][j] and dev_b[i][j] for
//               i, j in {0, 1}.
// dev_c:        output; the kernel writes dev_c[i][j] for i, j in {0, 1}.
//
// The body does not use threadIdx/blockIdx, so every thread that runs this
// kernel performs the whole 2x2 loop.
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[]) {
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}

然後你需要在主機端嵌套調用cudaMalloc來構造指針數組，並將其複製到設備內存中。對於你這個相當簡單的2x2示例，分配單個數組的代碼如下所示：

// Host-side table that receives the two device row pointers.
int **h_a = (int **)malloc(2 * sizeof(int *));

// One device allocation per row; each row holds two ints.
for (int row = 0; row < 2; ++row) {
    cudaMalloc((void **)&h_a[row], 2 * sizeof(int));
}

// Device-side copy of the pointer table; d_a is what gets passed to the kernel.
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2 * sizeof(int *), cudaMemcpyHostToDevice);

這樣，d_a中保存的就是已填充好的設備端指針數組，你需要把它傳遞給內核。

出於代碼複雜性和性能的原因，你其實並不想這樣做：在CUDA代碼中使用指針數組，比使用線性內存的替代方案更難寫，也更慢。

為了說明在CUDA中使用指針數組是多麼不明智，下面給出針對你的示例問題的一個完整可運行示例，它結合了上述兩個想法：

#include <cstdio>
#include <cstdlib>

// Element-wise sum of two 2x2 int matrices, each passed as an array of row
// pointers so the body can use dev_x[i][j] indexing.
//
// dev_a, dev_b: inputs, read at [i][j] for i, j in {0, 1}.
// dev_c:        output, written at [i][j] for i, j in {0, 1}.
//
// The body does not use threadIdx/blockIdx, so every launched thread performs
// the whole 2x2 loop; main() launches it with <<<1,1>>>.
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}

// Prints a diagnostic to stderr when `code` is not cudaSuccess and, if
// `Abort` is true, terminates the process with `code` as the exit status.
//
// code:  result of a CUDA runtime call.
// file:  source file name for the message (const because __FILE__ is a
//        string literal).
// line:  source line number for the message.
// Abort: exit on error when true (default); only report when false.
inline void GPUassert(cudaError_t code, const char * file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}

// Wraps a CUDA runtime call so that a failure is reported with its call site.
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

// Builds three 2x2 matrices as device arrays of row pointers, adds the first
// two on the GPU, prints the result and releases every allocation.
int main(void)
{
    const int aa[2][2]={{1,2},{3,4}};
    const int bb[2][2]={{5,6},{7,8}};
    int cc[2][2];

    // Host-side tables; each entry will hold a *device* pointer to one row.
    int ** h_a = (int **)malloc(2 * sizeof(int *));
    int ** h_b = (int **)malloc(2 * sizeof(int *));
    int ** h_c = (int **)malloc(2 * sizeof(int *));
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host malloc failed\n");
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Matrix a: allocate and fill each row, then upload the pointer table.
    for (int i = 0; i < 2; i++) {
        GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
        GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
    }

    int ** d_a;
    GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
    GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));

    // Matrix b: same procedure.
    for (int i = 0; i < 2; i++) {
        GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
        GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
    }

    int ** d_b;
    GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
    GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));

    // Matrix c: rows are allocated but not initialised; the kernel fills them.
    for (int i = 0; i < 2; i++) {
        GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
    }

    int ** d_c;
    GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
    GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));

    add<<<1,1>>>(d_a,d_b,d_c);
    GPUerrchk(cudaGetLastError());        // launch-configuration errors
    GPUerrchk(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // The result rows are read back through the host-side copy of the row
    // pointers, since d_c itself lives in device memory.
    for (int i = 0; i < 2; i++) {
        GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
    }

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            printf("(%d,%d):%d\n",i,j,cc[i][j]);
        }
    }

    // Release rows first, then the device pointer tables, then the host tables.
    for (int i = 0; i < 2; i++) {
        GPUerrchk(cudaFree(h_a[i]));
        GPUerrchk(cudaFree(h_b[i]));
        GPUerrchk(cudaFree(h_c[i]));
    }
    GPUerrchk(cudaFree(d_a));
    GPUerrchk(cudaFree(d_b));
    GPUerrchk(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);

    // cudaDeviceReset() replaces the deprecated cudaThreadExit(); a failure
    // exits with the CUDA error code via GPUerrchk, as the original return did.
    GPUerrchk(cudaDeviceReset());
    return 0;
}

我建議你研究它，直到你理解它的作用，以及與使用線性內存相比，它為什麼這麼糟糕。

反對 回復 2019-07-30