首頁猿問推動(dòng)用戶編寫的內(nèi)核

推動(dòng)用戶編寫的內(nèi)核

Swift 算法與數(shù)據(jù)結(jié)構(gòu)

白豬掌柜的 2019-10-29 14:46:00

我是Thrust的新手。我看到所有Thrust演示文稿和示例都只顯示了主機(jī)代碼。我想知道是否可以將device_vector傳遞給我自己的內(nèi)核？怎么樣？如果是，內(nèi)核/設(shè)備代碼中允許對其進(jìn)行哪些操作？

查看完整描述

3 回答

慕桂英4014372

TA貢獻(xiàn)1871條經(jīng)驗(yàn) 獲得超13個(gè)贊

正如最初編寫的那樣，Thrust純粹是主機(jī)端抽象。它不能在內(nèi)核內(nèi)部使用。您可以thrust::device_vector像這樣將封裝在a中的設(shè)備內(nèi)存?zhèn)鬟f給自己的內(nèi)核：

thrust::device_vector< Foo > fooVector;

// Do something thrust-y with fooVector

Foo* fooArray = thrust::raw_pointer_cast( &fooVector[0] );

// Pass raw array and its size to kernel

someKernelCall<<< x, y >>>( fooArray, fooVector.size() );

并且您還可以通過使用裸cuda設(shè)備內(nèi)存指針實(shí)例化推力::: device_ptr來使用推力算法中推力未分配的設(shè)備內(nèi)存。

經(jīng)過四年半的編輯，根據(jù)@JackOLantern的答案進(jìn)行補(bǔ)充，推力1.8添加了順序執(zhí)行策略，這意味著您可以在設(shè)備上運(yùn)行推力算法的單線程版本。注意，仍然不可能直接將推力設(shè)備向量傳遞給內(nèi)核，并且設(shè)備向量不能直接在設(shè)備代碼中使用。

請注意，thrust::device在某些情況下，也可以使用執(zhí)行策略，以由內(nèi)核作為子網(wǎng)格啟動(dòng)并行推力執(zhí)行。這需要單獨(dú)的編譯/設(shè)備鏈接和支持動(dòng)態(tài)并行性的硬件。我不確定是否所有推力算法實(shí)際上都支持此功能，但是肯定可以使用某些推力算法。

反對回復(fù) 2019-10-29

素胚勾勒不出你

TA貢獻(xiàn)1827條經(jīng)驗(yàn) 獲得超9個(gè)贊

這是我先前回答的更新。

從Thrust 1.8.1開始，CUDA Thrust原語可以與thrust::device執(zhí)行策略結(jié)合起來，以利用CUDA 動(dòng)態(tài)并行性在單個(gè)CUDA線程中并行運(yùn)行。下面，舉一個(gè)例子。

#include <stdio.h>

#include <thrust/reduce.h>

#include <thrust/execution_policy.h>

#include "TimingGPU.cuh"

#include "Utilities.cuh"

#define BLOCKSIZE_1D 256

#define BLOCKSIZE_2D_X 32

#define BLOCKSIZE_2D_Y 32

/*************************/

/* TEST KERNEL FUNCTIONS */

/*************************/

__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::seq, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);

}

__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::device, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);

}

/********/

/* MAIN */

/********/

int main() {

const int Nrows = 64;

const int Ncols = 2048;

gpuErrchk(cudaFree(0));

// size_t DevQueue;

// gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));

// DevQueue *= 128;

// gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));

float *h_data = (float *)malloc(Nrows * Ncols * sizeof(float));

float *h_results = (float *)malloc(Nrows * sizeof(float));

float *h_results1 = (float *)malloc(Nrows * sizeof(float));

float *h_results2 = (float *)malloc(Nrows * sizeof(float));

float sum = 0.f;

for (int i=0; i<Nrows; i++) {

h_results[i] = 0.f;

for (int j=0; j<Ncols; j++) {

h_data[i*Ncols+j] = i;

h_results[i] = h_results[i] + h_data[i*Ncols+j];

}

TimingGPU timerGPU;

float *d_data; gpuErrchk(cudaMalloc((void**)&d_data, Nrows * Ncols * sizeof(float)));

float *d_results1; gpuErrchk(cudaMalloc((void**)&d_results1, Nrows * sizeof(float)));

float *d_results2; gpuErrchk(cudaMalloc((void**)&d_results2, Nrows * sizeof(float)));

gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

timerGPU.StartCounter();

test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);

gpuErrchk(cudaPeekAtLastError());

gpuErrchk(cudaDeviceSynchronize());

printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());

gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

for (int i=0; i<Nrows; i++) {

if (h_results1[i] != h_results[i]) {

printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);

return 0;

}

timerGPU.StartCounter();

test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);

gpuErrchk(cudaPeekAtLastError());

gpuErrchk(cudaDeviceSynchronize());

printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());

gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

for (int i=0; i<Nrows; i++) {

if (h_results1[i] != h_results[i]) {

printf("Approach nr. 2; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);

return 0;

}

printf("Test passed!\n");

}

上面的示例對矩陣的行進(jìn)行縮減的方式與使用CUDA減少矩陣行的意義相同，但此操作與以上文章不同，即直接從用戶編寫的內(nèi)核中調(diào)用CUDA Thrust原語。此外，以上示例還用于比較在執(zhí)行兩個(gè)執(zhí)行策略（即thrust::seq和）時(shí)相同操作的性能thrust::device。下面，一些圖表顯示了性能差異。

性能已在開普勒K20c和Maxwell GeForce GTX 850M上進(jìn)行了評估。

反對回復(fù) 2019-10-29

慕碼人2483693

TA貢獻(xiàn)1860條經(jīng)驗(yàn) 獲得超9個(gè)贊

我想對此問題提供更新的答案。

從Thrust 1.8開始，CUDA Thrust原語可以與thrust::seq執(zhí)行策略結(jié)合使用，以在單個(gè)CUDA線程中順序運(yùn)行（或在單個(gè)CPU線程中順序運(yùn)行）。下面，舉一個(gè)例子。

如果要在線程內(nèi)并行執(zhí)行，則可以考慮使用CUB，它提供了可從線程塊內(nèi)調(diào)用的簡化例程，只要您的卡啟用了動(dòng)態(tài)并行性。

這是推力的例子

#include <stdio.h>

#include <thrust/reduce.h>

#include <thrust/execution_policy.h>

/********************/

/* CUDA ERROR CHECK */

/********************/

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)

{

if (code != cudaSuccess)

{

fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

if (abort) exit(code);

}

__global__ void test(float *d_A, int N) {

float sum = thrust::reduce(thrust::seq, d_A, d_A + N);

printf("Device side result = %f\n", sum);

}

int main() {

const int N = 16;

float *h_A = (float*)malloc(N * sizeof(float));

float sum = 0.f;

for (int i=0; i<N; i++) {

h_A[i] = i;

sum = sum + h_A[i];

}

printf("Host side result = %f\n", sum);

float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));

gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));

test<<<1,1>>>(d_A, N);

}

反對回復(fù) 2019-10-29

3 回答
0 關(guān)注
744 瀏覽

關(guān)注

添加回答

舉報(bào)

0/150

提交

取消

使用 Ctrl+D 可將網(wǎng)站添加到書簽

微信客服

購課補(bǔ)貼
聯(lián)系客服咨詢優(yōu)惠詳情

幫助反饋 APP下載

慕課網(wǎng)APP
您的移動(dòng)學(xué)習(xí)伙伴

公眾號

掃描二維碼
關(guān)注慕課網(wǎng)微信公眾號

第七色在线视频,2021少妇久久久久久久久久,亚洲欧洲精品成人久久av18,亚洲国产精品特色大片观看完整版,孙宇晨将参加特朗普的晚宴

熱搜

最近搜索清空

推動(dòng)用戶編寫的內(nèi)核

推動(dòng)用戶編寫的內(nèi)核

3 回答

添加回答