其他分享
首页 > 其他分享 > 【cuda】 thrust 进行加速

【cuda】 thrust 进行加速

作者:互联网

官方网站

thrust 的特点

代码部分

在 .cu 文件中定义封装 thrust 调用的函数(gpuInit)后,在 main.cpp 中进行调用

thrust_abcd.cuh

#pragma once
#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>

#include<chrono> 
#include <iostream>
// Reads 25 * 25 ints starting at `a`, copies them into device vectors, adds the
// data to itself element-wise with thrust::transform, and prints the measured
// setup/copy time and transform time to std::cout.
void gpuInit(int* a);
// Running total (ms) of the transform timings measured inside gpuInit; only
// samples below 1 ms are added.
extern float time_pure_gpu;
// Running total (ms) of the device-vector setup timings (two host-to-device
// copies plus the result allocation) measured inside gpuInit; only samples
// below 1 ms are added.
extern float time_cp_total;

thrust_abdf.cu

#include "thrust_abcd.cuh"

/**
 * Binary functor for the SAXPY element operation: result = a * x + y.
 *
 * Suitable as the binary operation of thrust::transform over two input
 * ranges. The call operator is usable from both host and device code.
 */
struct saxpy_functor
{
    const float a;  // scale factor applied to the first operand

    // Stores the scale factor used by every subsequent call.
    saxpy_functor(float _a) : a(_a) {}

    // Returns a * x + y for one pair of elements.
    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return a * x + y;
    }
};

// Running total (ms) of transform timings; gpuInit adds a sample only when it
// is below 1 ms.
float time_pure_gpu = 0;
// Running total (ms) of device-vector setup/copy timings; gpuInit adds a
// sample only when it is below 1 ms.
float time_cp_total = 0;
/**
 * Adds a 25 * 25 int buffer to itself on the GPU with Thrust and reports timings.
 *
 * Steps:
 *   1. Wrap the host data in a thrust::host_vector.
 *   2. Time the construction of two device copies plus a result vector.
 *   3. Time an element-wise addition of the two copies via thrust::transform.
 *   4. Copy the result back to the host.
 *
 * Each timing is printed in milliseconds. A timing is also added to the
 * matching global accumulator (time_cp_total / time_pure_gpu), but only when
 * it is below 1 ms.
 *
 * @param a Pointer to at least 25 * 25 readable ints. A null pointer is
 *          reported on std::cerr and the call returns without doing any work.
 */
void gpuInit(int* a)
{
    using Clock = std::chrono::high_resolution_clock;

    // Number of ints read from `a`.
    const int kCount = 25 * 25;

    if (a == nullptr)
    {
        std::cerr << "gpuInit: input pointer is null" << std::endl;
        return;
    }

    thrust::host_vector<int> H(a, a + kCount); // get cpu data
    std::cout << "H has size " << H.size() << std::endl;

    // Copy host_vector H to two device vectors and allocate the result vector.
    const Clock::time_point p0 = Clock::now();
    thrust::device_vector<int> D = H;   // first operand
    thrust::device_vector<int> E = H;   // second operand
    thrust::device_vector<int> D_res(E.size()); // D + E -> D_res
    const Clock::time_point p1 = Clock::now();
    const float time_cp = (float)std::chrono::duration_cast<std::chrono::microseconds>(p1 - p0).count() / 1000;
    std::cout << "copy data to gpu time:" << time_cp << "ms" << std::endl;

    // Only sub-millisecond samples contribute to the running total.
    if (time_cp < 1)
    {
        time_cp_total += time_cp;
    }

    // thrust::transform(first1, last1, first2, result, binary_op), where
    // binary_op is e.g. thrust::multiplies<T>() or thrust::plus<T>().
    // The functor is instantiated for int to match the vectors' element type;
    // a float functor would round-trip every operand through float and lose
    // exactness for magnitudes above 2^24.
    // NOTE(review): the host-clock interval below assumes thrust::transform has
    // finished its device work when it returns - confirm for the Thrust
    // version / execution policy in use.
    const Clock::time_point p2 = Clock::now();
    thrust::transform(D.begin(), D.end(), E.begin(), D_res.begin(), thrust::plus<int>());
    const Clock::time_point p3 = Clock::now();
    const float time_g = (float)std::chrono::duration_cast<std::chrono::microseconds>(p3 - p2).count() / 1000;
    std::cout << "pure gpu time:" << time_g << "ms" << std::endl;

    // Only sub-millisecond samples contribute to the running total.
    if (time_g < 1)
    {
        time_pure_gpu += time_g;
    }

    // Copy the sums back to the host.
    thrust::host_vector<int> H_res = D_res;

    // Debug aid: uncomment to print every input and its sum.
    // for (size_t i = 0; i < H_res.size(); i++)
    // {
    //     std::cout << "H[" << i << "] = " << H[i] << std::endl;
    //     std::cout << "res[" << i << "] = " << H_res[i] << std::endl;
    // }
}

main.cpp

#include "thrust_abcd.cuh"
/**
 * Benchmark driver for gpuInit.
 *
 * Fills a 25 * 25 int buffer with i * (i + 1), runs gpuInit once as a warm-up,
 * clears the global timing accumulators, then runs gpuInit kRuns more times
 * and prints the accumulated timings divided by kRuns.
 *
 * @return 0 on completion.
 */
int main()
{
    // Buffer dimensions. gpuInit reads exactly 25 * 25 ints from its argument,
    // so h * w must stay equal to 25 * 25.
    constexpr int h = 25; // rows
    constexpr int w = 25; // cols
    // Number of timed gpuInit calls the averages are computed over.
    constexpr int kRuns = 100;

    int a[h * w];
    for (int i = 0; i < h * w; i++)
    {
        a[i] = i * (i + 1);
    }

    // warmup
    gpuInit(a);
    // Discard whatever the warm-up call added so the totals cover exactly
    // the kRuns timed calls below.
    time_pure_gpu = 0;
    time_cp_total = 0;

    for (int i = 0; i < kRuns; ++i)
    {
        gpuInit(a);
    }

    std::cout << "avg pure gpu time: " << time_pure_gpu / kRuns << "ms" << std::endl;
    std::cout << "avg cp gpu time: " << time_cp_total / kRuns << "ms" << std::endl;
    std::cout << "add over" << std::endl;
    return 0;
}

标签:std,25,int,float,cuda,thrust,include,加速
来源: https://blog.csdn.net/weixin_41449637/article/details/120321338