【cuda】 thrust 进行加速
作者:互联网
官方网站:
- https://docs.nvidia.com/cuda/thrust/index.html
- https://github.com/NVIDIA/thrust
thrust的特点
- thrust一般还是只能单独(从host中)调用,并不适合和.cu混合使用。
- thrust中的算法主要是建立在vector和map<key,value>这两种数据结构之上。 比较适合工程使用,并不能实现复杂的算法;
- 如果你需要处理big size的vector和map,并且操作都比较简单,可以考虑使用 thrust;源文件后缀是.cpp,编译时使用cuda
- c++ release版本(debug目录有问题)
代码部分
定义核函数后,在main.cpp中进行调用
thrust_abcd.cuh
#pragma once
#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include<chrono>
#include <iostream>
// Runs one timed host->device copy and one timed element-wise add on the GPU.
// `a` must point to at least 25 * 25 readable ints (the definition reads
// exactly that many). Timings are printed to std::cout.
void gpuInit(int* a);
// Running total, in milliseconds, of the transform timings recorded by
// gpuInit (only samples below 1 ms are added).
extern float time_pure_gpu;
// Running total, in milliseconds, of the host->device copy timings recorded
// by gpuInit (only samples below 1 ms are added).
extern float time_cp_total;
thrust_abcd.cu
#include "thrust_abcd.cuh"
// Binary functor computing the SAXPY expression a * x + y for one element
// pair. Callable from both host and device code, so it can be passed to
// thrust::transform as the binary operation.
struct saxpy_functor
{
    const float a;  // scalar coefficient multiplied into x

    // _a: the scalar coefficient `a` in a * x + y.
    saxpy_functor(float _a) : a(_a) {}

    // Returns a * x + y.
    __host__ __device__
    float operator()(const float& x, const float& y) const {
        // The coefficient was previously stored but never applied (the
        // functor returned plain x + y); use it so the result is SAXPY.
        return a * x + y;
    }
};
// Accumulated transform ("pure GPU") time in milliseconds; gpuInit adds to it.
float time_pure_gpu = 0.0f;
// Accumulated host->device copy time in milliseconds; gpuInit adds to it.
float time_cp_total = 0.0f;
// Milliseconds between two clock samples, at microsecond resolution.
static float elapsedMs(std::chrono::high_resolution_clock::time_point start,
                       std::chrono::high_resolution_clock::time_point stop)
{
    return (float)std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000;
}

// Copies 25 * 25 ints from `a` into two device vectors, adds them
// element-wise on the device with thrust::transform, and copies the sum back
// to the host.
//
// a: pointer to at least 25 * 25 readable ints.
//
// Side effects: prints the vector size and both timings to std::cout, and
// adds each timing to time_cp_total / time_pure_gpu when it is below 1 ms.
void gpuInit(int* a)
{
    const int kCount = 25 * 25;  // number of ints read from `a`

    thrust::host_vector<int> H(a, a + kCount);  // host-side copy of the input
    std::cout << "H has size " << H.size() << std::endl;

    // Time the two host->device copies plus allocation of the result vector.
    const std::chrono::high_resolution_clock::time_point p0 = std::chrono::high_resolution_clock::now();
    thrust::device_vector<int> D = H;            // first operand
    thrust::device_vector<int> E = H;            // second operand
    thrust::device_vector<int> D_res(E.size());  // D + E is written here
    const std::chrono::high_resolution_clock::time_point p1 = std::chrono::high_resolution_clock::now();

    const float time_cp = elapsedMs(p0, p1);
    std::cout << "copy data to gpu time:" << time_cp << "ms" << std::endl;
    // Samples of 1 ms or more are left out of the running total.
    if (time_cp < 1)
        time_cp_total += time_cp;

    // thrust::transform(first1, last1, first2, result, binary_op)
    // The vectors hold int, so add with plus<int>; plus<float> would convert
    // every element to float and back, which is inexact above 2^24.
    // NOTE(review): this is host wall-clock time around the call; confirm that
    // thrust::transform has completed on return for the Thrust version and
    // execution policy in use, otherwise synchronize before taking p3.
    const std::chrono::high_resolution_clock::time_point p2 = std::chrono::high_resolution_clock::now();
    thrust::transform(D.begin(), D.end(), E.begin(), D_res.begin(), thrust::plus<int>());
    const std::chrono::high_resolution_clock::time_point p3 = std::chrono::high_resolution_clock::now();

    const float time_g = elapsedMs(p2, p3);
    std::cout << "pure gpu time:" << time_g << "ms" << std::endl;
    // Same 1 ms cut-off as for the copy timing.
    if (time_g < 1)
        time_pure_gpu += time_g;

    // Bring the result back to the host.
    thrust::host_vector<int> H_res = D_res;

    // Debug dump of inputs and results (disabled):
    // for (size_t i = 0; i < H_res.size(); i++)
    // {
    //     std::cout << "D[" << i << "] = " << D[i] << std::endl;
    //     std::cout << "res[" << i << "] = " << H_res[i] << std::endl;
    // }
}
main.cpp
#include "thrust_abcd.cuh"
// Benchmark driver: builds one 25 x 25 int buffer, runs gpuInit once as a
// warm-up, then runs it `runs` more times and prints the average timings.
int main()
{
    // gpuInit reads exactly 25 * 25 ints, so h * w must stay 625.
    const int h = 25;      // rows
    const int w = 25;      // cols
    const int runs = 100;  // number of measured iterations

    // method 2, thrust: input buffer filled with i * (i + 1)
    int a[h * w];
    for (int i = 0; i < h * w; i++)
    {
        a[i] = i * (i + 1);
    }

    // warmup
    gpuInit(a);
    // Discard whatever the warm-up run accumulated so the averages below
    // cover only the measured iterations.
    time_pure_gpu = 0;
    time_cp_total = 0;

    for (int i = 0; i < runs; ++i)
    {
        gpuInit(a);
    }

    // NOTE(review): gpuInit only accumulates samples below 1 ms, so these are
    // sums of the kept samples divided by the full run count.
    std::cout << "avg pure gpu time: " << time_pure_gpu / runs << "ms" << std::endl;
    std::cout << "avg cp gpu time: " << time_cp_total / runs << "ms" << std::endl;
    std::cout << "add over" << std::endl;
    return 0;
}
标签:std,25,int,float,cuda,thrust,include,加速 来源: https://blog.csdn.net/weixin_41449637/article/details/120321338