unified memory 笔记

unified memory能够实现同一个地址同时被CPU和GPU访问,由CUDA的驱动实现CPU和GPU之间的数据传输。
unified memory以page size为单位实现CPU与GPU之间的数据传输,使用getconf PAGESIZE命令能够得到系统的page size。如下所示,我的系统的page size为4096。

~ getconf PAGESIZE
4096

如下所示的代码,当申请的unified memory小于page size时,在两个GPU的情况下,可能出现同一个page上分配了两个unified memory,并且这两个unified memory被不同的GPU使用,这样导致同一个page在不同的GPU之间来回迁移数据,导致执行速度奇慢无比。

#include <stdio.h>
#include <unistd.h>

#include <chrono>   // std::chrono timing used in proc_thread1
#include <iostream>
#include <thread>
#include <vector>

#include <cuda_runtime.h>

// Demo kernel: every thread repeatedly read-modify-writes one of the `len`
// managed int64 cells reachable through ptr[], generating heavy traffic on
// the pages that back those allocations.
// Launch layout: 1-D grid, 1-D blocks; the flat thread index is folded into
// [0, len) so all threads stay in range regardless of launch size.
__global__ void proc(int64_t* ptr[],int len){
    int idx = (blockDim.x * blockIdx.x + threadIdx.x) % len;
    // 128 * 128 = 16384 doubling steps per thread keeps the kernel busy long
    // enough for page-migration costs to dominate the measured runtime.
    for(int pass = 0; pass < 128; ++pass){
        for(int step = 0; step < 128; ++step){
            *ptr[idx] += *ptr[idx];
        }
    }
}

// Worker thread bound to one GPU: allocates 128 tiny (8-byte) managed
// buffers, then launches `proc` on that GPU in an endless timing loop.
// Because each allocation is far smaller than a 4 KiB page, allocations made
// by the two threads can land on the same page; that page then ping-pongs
// between the GPUs and slows the kernels down dramatically — which is
// exactly what this demo is meant to show.
//
// gpu_id: CUDA device ordinal this thread drives. Never returns on success.
void proc_thread1(int gpu_id){
    // Select the target GPU before any allocation or launch in this thread,
    // so every subsequent runtime call is unambiguously bound to it.
    cudaSetDevice(gpu_id);

    void *array_ptr = nullptr;
    auto err = cudaMallocManaged(&array_ptr, sizeof(void*) * 128);
    if(err){
        // Bail out: without the pointer array nothing below can work.
        std::cout << "proc_thread1: cudaMallocManaged(pointer array) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return;
    }
    int64_t** data_ptr = (int64_t**)array_ptr;

    // 128 separate 8-byte managed allocations — many allocations per page,
    // the condition that triggers cross-GPU page thrashing.
    for(int i = 0; i < 128; ++i){
        void* ptr = nullptr;
        err = cudaMallocManaged(&ptr, 8);
        if(err){
            std::cout << "proc_thread1: cudaMallocManaged(8) failed: "
                      << cudaGetErrorString(err) << std::endl;
            return;
        }
        data_ptr[i] = (int64_t*)ptr;
    }

    // Endless benchmark loop: time one kernel execution per iteration.
    while(true){
        auto start = std::chrono::high_resolution_clock::now();

        proc<<<128,128>>>(data_ptr, 128);
        // Launch-configuration errors are otherwise silently dropped.
        err = cudaGetLastError();
        if(err){
            std::cout << "proc_thread1: kernel launch failed: "
                      << cudaGetErrorString(err) << std::endl;
            return;
        }
        cudaDeviceSynchronize();

        auto stop = std::chrono::high_resolution_clock::now();
        auto span = std::chrono::duration<double, std::milli>(stop - start).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }
}

// Entry point: start one worker per GPU (devices 0 and 1), then wait.
// The join calls never return because each worker loops forever.
int main(){
    std::vector<std::thread> workers;
    for(int gpu = 0; gpu < 2; ++gpu){
        workers.emplace_back(proc_thread1, gpu);
    }
    for(auto& w : workers){
        w.join();
    }
    return 0;
}

执行效果如下:

gpu id = 0, cost :625.921(ms)
gpu id = 0, cost :2100.8(ms)
gpu id = 0, cost :2100.55(ms)
gpu id = 0, cost :2091.64(ms)
gpu id = 1, cost :8067.24(ms)
gpu id = 0, cost :2101.75(ms)
gpu id = 0, cost :2096.87(ms)
gpu id = 0, cost :2096.91(ms)
gpu id = 0, cost :2094.44(ms)
gpu id = 1, cost :8310(ms)
gpu id = 0, cost :2105.92(ms)
gpu id = 0, cost :2097.66(ms)

如果申请的unified memory等于page size,则不会出现同一个page在不同的GPU之间来回迁移数据的情况,此时执行速度也非常快,代码如下:

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>


// Demo kernel (identical role to program 1): every thread repeatedly
// read-modify-writes one of the `len` managed int64 cells in ptr[].
// Launch layout: 1-D grid, 1-D blocks; the flat index is folded into
// [0, len) so any launch size is safe.
__global__ void proc(int64_t* ptr[],int len){
    int idx = (blockDim.x * blockIdx.x + threadIdx.x) % len;
    // 16384 doubling steps per thread so memory behavior, not launch
    // overhead, dominates the measured time.
    for(int pass = 0; pass < 128; ++pass){
        for(int step = 0; step < 128; ++step){
            *ptr[idx] += *ptr[idx];
        }
    }
}

// Worker thread bound to one GPU: allocates 128 page-sized (4096-byte)
// managed buffers, then launches `proc` on that GPU in an endless timing
// loop. Because each allocation fills a whole 4 KiB page, allocations from
// the two threads never share a page, so no cross-GPU page thrashing occurs
// and the kernels run fast — the contrast this demo is meant to show.
//
// gpu_id: CUDA device ordinal this thread drives. Never returns on success.
// NOTE: relies on <chrono> being pulled in transitively; add the explicit
// include when integrating.
void proc_thread1(int gpu_id){
    // Select the target GPU before any allocation or launch in this thread.
    cudaSetDevice(gpu_id);

    void *array_ptr = nullptr;
    // 128 slots, matching the 128 allocations below (the original reserved
    // 512 slots but only ever used 128 — a copy-paste inconsistency).
    auto err = cudaMallocManaged(&array_ptr, sizeof(void*) * 128);
    if(err){
        std::cout << "proc_thread1: cudaMallocManaged(pointer array) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return;
    }
    int64_t** data_ptr = (int64_t**)array_ptr;

    // One full page (4096 bytes) per allocation — no page sharing possible.
    for(int i = 0; i < 128; ++i){
        void* ptr = nullptr;
        err = cudaMallocManaged(&ptr, 4096);
        if(err){
            std::cout << "proc_thread1: cudaMallocManaged(4096) failed: "
                      << cudaGetErrorString(err) << std::endl;
            return;
        }
        data_ptr[i] = (int64_t*)ptr;
    }

    // Endless benchmark loop: time one kernel execution per iteration.
    while(true){
        auto start = std::chrono::high_resolution_clock::now();

        proc<<<128,128>>>(data_ptr, 128);
        // Launch-configuration errors are otherwise silently dropped.
        err = cudaGetLastError();
        if(err){
            std::cout << "proc_thread1: kernel launch failed: "
                      << cudaGetErrorString(err) << std::endl;
            return;
        }
        cudaDeviceSynchronize();

        auto stop = std::chrono::high_resolution_clock::now();
        auto span = std::chrono::duration<double, std::milli>(stop - start).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }
}


// Entry point: start one worker per GPU (devices 0 and 1), then wait.
// The join calls never return because each worker loops forever.
int main(){
    std::vector<std::thread> workers;
    for(int gpu = 0; gpu < 2; ++gpu){
        workers.emplace_back(proc_thread1, gpu);
    }
    for(auto& w : workers){
        w.join();
    }
    return 0;
}

执行效果如下:

gpu id = 0, cost :147.898(ms)
gpu id = 0, cost :133.469(ms)
gpu id = 1, cost :306.239(ms)
gpu id = 0, cost :121.874(ms)
gpu id = 1, cost :133.641(ms)
gpu id = 0, cost :118.545(ms)
gpu id = 1, cost :124.928(ms)
gpu id = 0, cost :117.004(ms)
gpu id = 1, cost :114.917(ms)
gpu id = 0, cost :116.955(ms)
gpu id = 1, cost :112.313(ms)
gpu id = 0, cost :116.965(ms)
gpu id = 1, cost :111.29(ms)
gpu id = 0, cost :116.978(ms)
gpu id = 1, cost :111.296(ms)
gpu id = 0, cost :117.008(ms)

因此在多卡的情况下使用unified memory,尽量不要分配零碎的小片内存。