
CUDA matrix addition with block tiling and cudaMemcpy2D


This program implements matrix addition on the GPU. The matrix is processed by tiles of threads, the data is transferred with cudaMemcpy2D into pitched (row-aligned) device memory to improve access efficiency, the GPU version is timed against a CPU reference implementation, and the two results are compared to verify correctness.
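A note on the pitch before reading the listing: cudaMallocPitch returns a row pitch in bytes, which can be larger than sizeof(int) * W because each row may be padded for alignment. The listing below converts the pitch into an element count (pitch / sizeof(int)) before indexing. An equivalent kernel that keeps the pitch in bytes and follows the addressing pattern documented for cudaMallocPitch would look roughly like the sketch below; matrixAddPitched is only an illustrative name and is not part of the original program.

__global__ void matrixAddPitched(int* c, const int* a, const int* b,
                                 size_t pitchBytes, int width, int height)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < width && y < height) {
        // Rows start every pitchBytes bytes, so compute row pointers through a char* cast.
        const int* rowA = (const int*)((const char*)a + y * pitchBytes);
        const int* rowB = (const int*)((const char*)b + y * pitchBytes);
        int* rowC = (int*)((char*)c + y * pitchBytes);
        rowC[x] = rowA[x] + rowB[x];
    }
}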

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <malloc.h>
#include <random>
#include "time.h"
#define W 2000//矩阵维度
#define H 3000

int a[H][W];
int b[H][W];
int c1[H][W];// stores the CPU result
int c2[H][W];// stores the GPU result

__global__ void matrixAddGPU(int* c, int* a, int* b, size_t pitch)// GPU version; pitch is given in elements
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;// global column index
    int y = blockDim.y * blockIdx.y + threadIdx.y;// global row index
    if (x < W && y < H) {// guard against out-of-bounds accesses
        c[y * pitch + x] = a[y * pitch + x] + b[y * pitch + x];
    }
}

void matrixAddCPU(int c[][W], int a[][W], int b[][W]) {// CPU reference version
    for (int i = 0; i < H; ++i) {
        for (int j = 0; j < W; ++j)
            c[i][j] = a[i][j] + b[i][j];
    }
}

int main()
{
    srand(0);
    for (int i = 0; i < H; ++i) {
        for (int j = 0; j < W; ++j) {
            a[i][j] = rand() % 1000;
            b[i][j] = rand() % 1000;
        }
    }

    clock_t start, end;
    double elapsedTime;
    start = clock();
    matrixAddCPU(c1, a, b);
    end = clock();
    elapsedTime = (double)(end - start) * 1000.0 / CLOCKS_PER_SEC;// convert clock ticks to milliseconds
    printf("time to generate CPU:%5.3f ms\n", elapsedTime);// print the CPU execution time

    int* dev_a, * dev_b, * dev_c;
    size_t pitch;// row pitch in bytes returned by cudaMallocPitch (identical for all three allocations here)
    cudaMallocPitch((void**)&dev_a, &pitch, sizeof(int) * W, H);
    cudaMemcpy2D(dev_a, pitch,// copy a into pitched device memory; host rows are tightly packed
        a, sizeof(int) * W,
        sizeof(int) * W, H, cudaMemcpyHostToDevice);
    cudaMallocPitch((void**)&dev_b, &pitch, sizeof(int) * W, H);
    cudaMemcpy2D(dev_b, pitch,// copy b into pitched device memory
        b, sizeof(int) * W,
        sizeof(int) * W, H, cudaMemcpyHostToDevice);
    cudaMallocPitch((void**)&dev_c, &pitch, sizeof(int) * W, H);

    dim3 dimBlock(16, 16);// tile (thread block) size
    dim3 dimGrid((W + 16 - 1) / 16, (H + 16 - 1) / 16);// grid dimensions after tiling, rounded up

    cudaEvent_t start1, stop1;
    cudaEventCreate(&start1);
    cudaEventCreate(&stop1);
    cudaEventRecord(start1, 0);
    matrixAddGPU<<<dimGrid, dimBlock>>>(dev_c,
        dev_a, dev_b, pitch / sizeof(int));// pass the pitch in elements, not bytes
    cudaEventRecord(stop1, 0);
    cudaEventSynchronize(stop1);
    float elapsedTime1;
    cudaEventElapsedTime(&elapsedTime1, start1, stop1);
    printf("time to generate GPU:% 5.3f ms\n", elapsedTime1);//打印GPU执行用时
    cudaEventDestroy(start1);
    cudaEventDestroy(stop1);

    cudaMemcpy2D(c2, sizeof(int) * W,// copy the GPU result back to the host
        dev_c, pitch, sizeof(int) * W, H, cudaMemcpyDeviceToHost);

    bool flag = true;
    for (int i = 0; i < H; ++i) {// verify that the GPU result matches the CPU result
        for (int j = 0; j < W; ++j)
            if (c1[i][j] != c2[i][j]) {
                flag = false;
                break;
            }
    }
    if (flag) printf("Consistent!!!\n");
    else printf("Not consistent!!!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}
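The listing can be compiled directly with nvcc; the source file name below is an assumption, not taken from the original post.

nvcc -O2 matrix_add_pitch.cu -o matrix_add_pitch
./matrix_add_pitch

Note that the program does not check the return values of the CUDA API calls or the kernel launch; when adapting it, checking each cudaError_t (and calling cudaGetLastError() after the launch) makes failures much easier to diagnose.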

Source: https://blog.csdn.net/qq_44643644/article/details/119142045