Unified memory allows the same address to be accessed by both the CPU and the GPU; the CUDA driver takes care of the data transfers between them.
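As a quick illustration (a minimal sketch, error handling omitted), a single managed allocation can be written by the CPU and updated by the GPU through the same pointer:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void increment(int* x) { *x += 1; }   // GPU writes through the same pointer

int main() {
    int* x = nullptr;
    cudaMallocManaged(&x, sizeof(int));  // one allocation, visible to CPU and GPU
    *x = 41;                             // CPU write
    increment<<<1, 1>>>(x);              // GPU read-modify-write; the driver migrates the page
    cudaDeviceSynchronize();
    printf("%d\n", *x);                  // CPU read after sync: prints 42
    cudaFree(x);
    return 0;
}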
Unified memory migrates data between the CPU and the GPU at page-size granularity. The system page size can be queried with the getconf PAGESIZE command; as shown below, the page size on my system is 4096:
~ getconf PAGESIZE
4096
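The same value can also be read programmatically; a minimal sketch using the POSIX sysconf call:

#include <unistd.h>
#include <cstdio>

int main() {
    long page_size = sysconf(_SC_PAGESIZE);  // same value getconf PAGESIZE reports
    printf("page size = %ld\n", page_size);
    return 0;
}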
In the code below, each requested unified memory allocation is smaller than the page size, so on a machine with two GPUs two allocations may end up on the same page. When those two allocations are used by different GPUs, that shared page is migrated back and forth between the GPUs, which makes execution extremely slow.
#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <vector>

// Each kernel thread repeatedly read-modify-writes one of the managed
// allocations, forcing the pages they live on to be resident on this GPU.
__global__ void proc(int64_t* ptr[], int len) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for (int i = 0; i < 128; ++i) {
        for (int j = 0; j < 128; ++j) {
            *ptr[idx] += *ptr[idx];
        }
    }
}

void proc_thread1(int gpu_id) {
    void* array_ptr;
    auto err = cudaMallocManaged(&array_ptr, sizeof(void*) * 128);
    if (err) {
        std::cout << "proc_thread1 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)array_ptr;
    for (int i = 0; i < 128; ++i) {
        void* ptr = nullptr;
        // 8 bytes per allocation: far smaller than a page, so allocations made
        // by the two threads can land on the same page.
        auto err = cudaMallocManaged(&ptr, 8);
        if (err) {
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        data_ptr[i] = (int64_t*)ptr;
    }
    cudaSetDevice(gpu_id);
    while (true) {
        auto start = std::chrono::high_resolution_clock::now();
        proc<<<128, 128>>>(data_ptr, 128);
        cudaDeviceSynchronize();
        auto stop = std::chrono::high_resolution_clock::now();
        auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }
}

int main() {
    std::thread th0(&proc_thread1, 0);
    std::thread th1(&proc_thread1, 1);
    th0.join();
    th1.join();
    return 0;
}
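The examples in this post can be built with nvcc (the file and binary names below are placeholders I chose) and need a machine with at least two GPUs:

~ nvcc -std=c++11 uvm_test.cu -o uvm_test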
The output looks like this:
gpu id = 0, cost :625.921(ms)
gpu id = 0, cost :2100.8(ms)
gpu id = 0, cost :2100.55(ms)
gpu id = 0, cost :2091.64(ms)
gpu id = 1, cost :8067.24(ms)
gpu id = 0, cost :2101.75(ms)
gpu id = 0, cost :2096.87(ms)
gpu id = 0, cost :2096.91(ms)
gpu id = 0, cost :2094.44(ms)
gpu id = 1, cost :8310(ms)
gpu id = 0, cost :2105.92(ms)
gpu id = 0, cost :2097.66(ms)
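To confirm that the slowdown really is page migration rather than something else, a profiler's unified memory counters can be checked; for example, nvprof reports host-to-device and device-to-host migration counts in its "Unified Memory profiling result" section (the while(true) loop would need to be bounded first so the run terminates and the summary gets printed):

~ nvprof ./uvm_test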
If each requested unified memory allocation is exactly the page size, no page ends up shared between allocations used by different GPUs, so there is no back-and-forth migration and execution is fast. The code:
#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <vector>

__global__ void proc(int64_t* ptr[], int len) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for (int i = 0; i < 128; ++i) {
        for (int j = 0; j < 128; ++j) {
            *ptr[idx] += *ptr[idx];
        }
    }
}

void proc_thread1(int gpu_id) {
    void* array_ptr;
    auto err = cudaMallocManaged(&array_ptr, sizeof(void*) * 128);
    if (err) {
        std::cout << "proc_thread1 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)array_ptr;
    for (int i = 0; i < 128; ++i) {
        void* ptr = nullptr;
        // 4096 bytes = one full page per allocation, so no page is ever
        // shared between the two threads' allocations.
        auto err = cudaMallocManaged(&ptr, 4096);
        if (err) {
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        data_ptr[i] = (int64_t*)ptr;
    }
    cudaSetDevice(gpu_id);
    while (true) {
        auto start = std::chrono::high_resolution_clock::now();
        proc<<<128, 128>>>(data_ptr, 128);
        cudaDeviceSynchronize();
        auto stop = std::chrono::high_resolution_clock::now();
        auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }
}

int main() {
    std::thread th0(&proc_thread1, 0);
    std::thread th1(&proc_thread1, 1);
    th0.join();
    th1.join();
    return 0;
}
The output looks like this:
gpu id = 0, cost :147.898(ms)
gpu id = 0, cost :133.469(ms)
gpu id = 1, cost :306.239(ms)
gpu id = 0, cost :121.874(ms)
gpu id = 1, cost :133.641(ms)
gpu id = 0, cost :118.545(ms)
gpu id = 1, cost :124.928(ms)
gpu id = 0, cost :117.004(ms)
gpu id = 1, cost :114.917(ms)
gpu id = 0, cost :116.955(ms)
gpu id = 1, cost :112.313(ms)
gpu id = 0, cost :116.965(ms)
gpu id = 1, cost :111.29(ms)
gpu id = 0, cost :116.978(ms)
gpu id = 1, cost :111.296(ms)
gpu id = 0, cost :117.008(ms)
So, when using unified memory with multiple GPUs, avoid allocating many small fragments of memory.
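One way to follow this advice, assuming the wasted space is acceptable, is to pad every managed allocation up to a multiple of the page size so that allocations used by different GPUs never share a page, and optionally hint the preferred residency with cudaMemAdvise. The helper below (alloc_managed_page_aligned is a hypothetical name, not part of the CUDA API) is a minimal sketch of that idea:

#include <unistd.h>
#include <cuda_runtime.h>

// Hypothetical helper: pad each managed allocation to a whole number of pages
// so that two GPUs never contend for the same page.
void* alloc_managed_page_aligned(size_t bytes, int gpu_id) {
    size_t page = (size_t)sysconf(_SC_PAGESIZE);          // e.g. 4096
    size_t rounded = ((bytes + page - 1) / page) * page;  // round up to a page multiple
    void* ptr = nullptr;
    if (cudaMallocManaged(&ptr, rounded) != cudaSuccess) {
        return nullptr;
    }
    // Optional hint: tell the driver which GPU will mostly touch this memory,
    // so its pages prefer to stay resident on that device.
    cudaMemAdvise(ptr, rounded, cudaMemAdviseSetPreferredLocation, gpu_id);
    return ptr;
}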