AMD Fusion て何がFusionしてんの?
というわけ(?)で、前回の予想では、「単にGPUくっつけただけでしたズコー」という予想だったわけだが、実際のところどうなっているのか見ておく。
…見た。物体はE350M1/USB3。ひとことで言うと、僕が計測した範囲では「Fusion」という単語から想定されるような何かは全く見つからなかった。
想定してたのは、
- カーネル呼び出しのレイテンシがめっちゃ低い
- ホストのポインタがそのまま使える
- そのままでなくともうまくすれば使える
みたいな感じだった。
全くそんなことはなかった。
- CL_MEM_USE_HOST_PTR は使えない(ように見える)
- CL_MEM_ALLOC_HOST_PTR とか頑張ったけどなんかコピーしてるっぽい
- PCIeSpeedTestの結果は2GB/secぐらい。(DDR3-10600 single channel)
- カーネル呼び出しが 500[usec]
- メモリ転送のレイテンシも 500[usec]
ズコー。レイテンシ500usecて…500usecもあれば今時のCPUなら何でもできるがな…個人的には夢や希望というものを一切感じないですね…
Linuxでも検証しようとしたがドライバインストールがうまくいかんかったので保留。最近fglrxをちゃんと動かせた記憶が無い。
以下検証コード。
#ifdef __APPLE__ #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif #include <stdlib.h> #include <string.h> #include <stdio.h> #ifdef _WIN32 #include <windows.h> typedef LARGE_INTEGER timesec_t; typedef LARGE_INTEGER freq_t; #else #include <sys/time.h> typedef double timesec_t; typedef int freq_t; static void QueryPerformanceCounter(timesec_t *t) { struct timeval tv; gettimeofday(&tv, NULL); *t = tv.tv_sec + tv.tv_usec/1000000.0; } static void QueryPerformanceFrequency(freq_t *t) { *t = 1; } static inline void* _aligned_malloc(size_t sz, int align) { void *p; posix_memalign(&p, align, sz); return p; } #endif freq_t qf; static cl_kernel empty_kernel, f_kernel; static double get_diff(timesec_t a, timesec_t b, freq_t freq) { #ifdef _WIN32 double da = a.QuadPart; double db = b.QuadPart; return ((da - db) / freq.QuadPart); #else return a-b; #endif } const char source[] = "" "__kernel void f(__global int *in, __global int *out, uint num_iter) {\n" " int idx = get_global_id(0);\n" " for (int i=0; i<num_iter; i++) {\n" " out[idx*num_iter + i] = in[idx*num_iter + i];\n" " }\n" "}\n" "__kernel void empty(void) {\n" "}\n" ; static int test(cl_context ctxt, cl_command_queue q, int buf_size, void *src_host, void *dst_host, int use_host_ptr) { cl_int err; cl_mem src_mem_obj, dst_mem_obj; void *src_mapped, *dst_mapped; if (use_host_ptr) { puts("==== use host ptr ===="); src_mem_obj = clCreateBuffer(ctxt, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE, buf_size, src_host, &err); dst_mem_obj = clCreateBuffer(ctxt, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE, buf_size, dst_host, &err); } else { puts("==== alloc host ptr ===="); src_mem_obj = clCreateBuffer(ctxt, CL_MEM_ALLOC_HOST_PTR|CL_MEM_READ_WRITE, buf_size, NULL, &err); dst_mem_obj = clCreateBuffer(ctxt, CL_MEM_ALLOC_HOST_PTR|CL_MEM_READ_WRITE, buf_size, NULL, &err); } if (err != CL_SUCCESS) { puts("map"); return 1; } { int j; for (j=0; j<4; j++) { timesec_t b,e; double t; QueryPerformanceCounter(&b); memcpy(dst_host, src_host, buf_size); QueryPerformanceCounter(&e); t = get_diff(e,b,qf); printf("memcpy %dMB: %f[sec] %f[GB/s]\n", buf_size/(1024*1024), t, (buf_size/t) / (1024*1024*1024)); } } { int j; for (j=0; j<4; j++) { timesec_t b,e; double t; cl_event ev[2]; QueryPerformanceCounter(&b); src_mapped = clEnqueueMapBuffer(q, src_mem_obj, CL_FALSE, CL_MAP_READ|CL_MAP_WRITE, 0, buf_size, 0, NULL, &ev[0], &err); dst_mapped = clEnqueueMapBuffer(q, dst_mem_obj, CL_FALSE, CL_MAP_READ|CL_MAP_WRITE, 0, buf_size, 0, NULL, &ev[1], &err); clWaitForEvents(2, ev); clReleaseEvent(ev[0]); clReleaseEvent(ev[1]); err = clEnqueueUnmapMemObject(q, src_mem_obj, src_mapped, 0, NULL, &ev[0]); err = clEnqueueUnmapMemObject(q, dst_mem_obj, dst_mapped, 0, NULL, &ev[1]); clWaitForEvents(2, ev); clReleaseEvent(ev[0]); clReleaseEvent(ev[1]); QueryPerformanceCounter(&e); t = get_diff(e,b,qf); printf("map-unmap %dMB: %f[sec] %f[GB/s]\n", buf_size/(1024*1024), t, (buf_size/t) / (1024*1024*1024)); } } { int j; for (j=0; j<4; j++) { timesec_t b,e; double t; cl_event ev[2]; QueryPerformanceCounter(&b); src_mapped = clEnqueueMapBuffer(q, src_mem_obj, CL_FALSE, CL_MAP_READ|CL_MAP_WRITE, 0, buf_size, 0, NULL, &ev[0], &err); dst_mapped = clEnqueueMapBuffer(q, dst_mem_obj, CL_FALSE, CL_MAP_READ|CL_MAP_WRITE, 0, buf_size, 0, NULL, &ev[1], &err); clWaitForEvents(2, ev); clReleaseEvent(ev[0]); clReleaseEvent(ev[1]); memcpy(dst_mapped, src_mapped, buf_size); err = clEnqueueUnmapMemObject(q, src_mem_obj, src_mapped, 0, NULL, &ev[0]); err = clEnqueueUnmapMemObject(q, dst_mem_obj, dst_mapped, 0, NULL, &ev[1]); clWaitForEvents(2, ev); QueryPerformanceCounter(&e); clReleaseEvent(ev[0]); clReleaseEvent(ev[1]); t = get_diff(e,b,qf); printf("map-unmap + memcpy %dMB: %f[sec] %f[GB/s]\n", buf_size/(1024*1024), t, (buf_size/t) / (1024*1024*1024)); } } { int j; for (j=0; j<4; j++) { size_t gws[] = {1, 1, 1}; size_t lws[] = {1, 1, 1}; cl_event ev; timesec_t b,e; double t; QueryPerformanceCounter(&b); err = clEnqueueNDRangeKernel(q, empty_kernel, 1, NULL, gws, lws, 0, NULL, &ev); clWaitForEvents(1, &ev); QueryPerformanceCounter(&e); clReleaseEvent(ev); if (err != CL_SUCCESS) { puts("enq nd"); return 1; } t = get_diff(e,b,qf); printf("empty latency %f[sec]\n", t); } } { int j; for (j=0; j<100; j++) { clEnqueueReadBuffer(q, dst_mem_obj, CL_TRUE, 0, 1, dst_host, 0, NULL, NULL); } for (j=0; j<4; j++) { timesec_t b,e; double t; QueryPerformanceCounter(&b); clEnqueueReadBuffer(q, dst_mem_obj, CL_TRUE, 0, 1, dst_host, 0, NULL, NULL); QueryPerformanceCounter(&e); t = get_diff(e,b,qf); printf("read latency(1byte): %f[sec]\n", t); } } { int j; int num_work_item = 256; int num_wg = 256; cl_uint num_iter = (buf_size/4) / num_wg; clSetKernelArg(f_kernel, 0, sizeof(cl_mem), (void*)&src_mem_obj); clSetKernelArg(f_kernel, 1, sizeof(cl_mem), (void*)&dst_mem_obj); clSetKernelArg(f_kernel, 2, sizeof(cl_uint), (void*)&num_iter); for (j=0; j<4; j++) { size_t gws[] = {num_wg, 0, 0}; size_t lws[] = {num_work_item, 0, 0}; timesec_t b,e; double t; cl_event ev; int c; memset(src_host, 0xaa+j+use_host_ptr*8, buf_size); memset(dst_host, 0x55, buf_size); //if (! use_host_ptr) clEnqueueWriteBuffer(q, src_mem_obj, CL_TRUE, 0, buf_size, src_host, 0, NULL, NULL); QueryPerformanceCounter(&b); err = clEnqueueNDRangeKernel(q, f_kernel, 1, NULL, gws, lws, 0, NULL, &ev); clWaitForEvents(1, &ev); QueryPerformanceCounter(&e); clReleaseEvent(ev); //if (! use_host_ptr) clEnqueueReadBuffer(q, dst_mem_obj, CL_TRUE, 0, buf_size, dst_host, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("enq nd %d\n", err); return 1; } t = get_diff(e,b,qf); printf("gpu memcpy %dMB: %f[sec] %f[GB/s]\n", buf_size/(1024*1024), t, (buf_size/t) / (1024*1024*1024)); for (c=0; c<buf_size; c++) { int s = ((unsigned char*)src_host)[c]; int d = ((unsigned char*)dst_host)[c]; if (s != d) { printf("%d %x!=%x\n", c, s, d); break; } } } } clReleaseMemObject(src_mem_obj); clReleaseMemObject(dst_mem_obj); return 0; } int main(int argc, char **argv) { cl_platform_id pls[1]; cl_device_id devs[1]; cl_uint n; cl_context_properties cps[3]; cl_int err; cl_context ctxt; cl_program prog; const char *src[] = {source}; size_t src_sz[] = {sizeof(source) - 1}; cl_command_queue q; size_t buf_size = 1024*1024; cl_bool hostptr; void *src_host, *dst_host; if (argc >= 2) { buf_size = atoi(argv[1]) * 1024*1024; } QueryPerformanceFrequency(&qf); src_host = _aligned_malloc(buf_size, 128); dst_host = _aligned_malloc(buf_size, 128); clGetPlatformIDs(1, pls, &n); cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)pls[0]; cps[2] = 0; ctxt = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); clGetDeviceIDs(pls[0], CL_DEVICE_TYPE_GPU, 1, devs, &n); if (err != CL_SUCCESS) { puts("get dev"); return 1; } prog = clCreateProgramWithSource(ctxt, 1, src, src_sz, &err); if (err != CL_SUCCESS) { puts("cre pro"); return 1; } err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { char log[1024]; size_t ret; puts("build pro"); err = clGetProgramBuildInfo(prog, devs[0], CL_PROGRAM_BUILD_LOG, 1024, log, &ret); printf("%d %s\n", err, log); return 1; } empty_kernel = clCreateKernel(prog, "empty", &err); if (err != CL_SUCCESS) { printf("%d\n", err); puts("cre ker e"); return 1; } f_kernel = clCreateKernel(prog, "f", &err); if (err != CL_SUCCESS) { puts("cre ker f"); return 1; } q = clCreateCommandQueue(ctxt, devs[0], 0, &err); if (err != CL_SUCCESS) { puts("cre q"); return 1; } #if defined CL_DEVICE_HOST_UNIFIED_MEMORY err = clGetDeviceInfo(devs[0], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(hostptr), &hostptr, &n); if (err != CL_SUCCESS) { printf("%d %d\n", err, n); return 1; } if (hostptr) { puts("unified memory"); } #else hostptr = 0; #endif test(ctxt, q, buf_size, src_host, dst_host, 0); test(ctxt, q, buf_size, src_host, dst_host, 1); return 0; }
結果
==== alloc host ptr ==== memcpy 4MB: 0.005789[sec] 0.674741[GB/s] memcpy 4MB: 0.002957[sec] 1.321148[GB/s] memcpy 4MB: 0.002954[sec] 1.322579[GB/s] memcpy 4MB: 0.003057[sec] 1.277995[GB/s] map-unmap 4MB: 0.021169[sec] 0.184530[GB/s] map-unmap 4MB: 0.009776[sec] 0.399562[GB/s] map-unmap 4MB: 0.007539[sec] 0.518141[GB/s] map-unmap 4MB: 0.007918[sec] 0.493348[GB/s] map-unmap + memcpy 4MB: 0.010901[sec] 0.358325[GB/s] map-unmap + memcpy 4MB: 0.011074[sec] 0.352734[GB/s] map-unmap + memcpy 4MB: 0.010828[sec] 0.360760[GB/s] map-unmap + memcpy 4MB: 0.010718[sec] 0.364444[GB/s] empty latency 0.002554[sec] empty latency 0.000797[sec] empty latency 0.000739[sec] empty latency 0.000865[sec] read latency(1byte): 0.000918[sec] read latency(1byte): 0.000715[sec] read latency(1byte): 0.000884[sec] read latency(1byte): 0.000886[sec] gpu memcpy 4MB: 0.020204[sec] 0.193345[GB/s] gpu memcpy 4MB: 0.018170[sec] 0.214987[GB/s] gpu memcpy 4MB: 0.019316[sec] 0.202230[GB/s] gpu memcpy 4MB: 0.018692[sec] 0.208981[GB/s] ==== use host ptr ==== memcpy 4MB: 0.004560[sec] 0.856660[GB/s] memcpy 4MB: 0.003426[sec] 1.140027[GB/s] memcpy 4MB: 0.003097[sec] 1.261356[GB/s] memcpy 4MB: 0.003093[sec] 1.262922[GB/s] map-unmap 4MB: 0.019887[sec] 0.196425[GB/s] map-unmap 4MB: 0.023793[sec] 0.164175[GB/s] map-unmap 4MB: 0.019714[sec] 0.198146[GB/s] map-unmap 4MB: 0.021684[sec] 0.180141[GB/s] map-unmap + memcpy 4MB: 0.023859[sec] 0.163722[GB/s] map-unmap + memcpy 4MB: 0.022675[sec] 0.172270[GB/s] map-unmap + memcpy 4MB: 0.023456[sec] 0.166536[GB/s] map-unmap + memcpy 4MB: 0.024121[sec] 0.161945[GB/s] empty latency 0.001665[sec] empty latency 0.000922[sec] empty latency 0.000688[sec] empty latency 0.000630[sec] read latency(1byte): 0.000599[sec] read latency(1byte): 0.000444[sec] read latency(1byte): 0.000568[sec] read latency(1byte): 0.000540[sec] gpu memcpy 4MB: 0.019292[sec] 0.202485[GB/s] gpu memcpy 4MB: 0.018870[sec] 0.207010[GB/s] gpu memcpy 4MB: 0.018908[sec] 0.206590[GB/s] gpu memcpy 4MB: 0.018868[sec] 0.207032[GB/s]
今後に期待…と言いたいところだけど、AMDは石油が続く限り仕事しなくてよいという、競争原理から切り離された平和な会社である点を考えると、これ以上のものが出ることは無いだろうなぁ…
いや、drmを直接叩いてなんとか…ならんかね…いやカーネル死んだ。まあ、大体わかったのでいいか。
計算性能だけを考えるなら、AVX+FMAを実装すべきだったのでは。1.6[GHz] * 8[float] * 2[fma] * 2[core] = 51.2[GFlops]ですよ(E350のGPUは80[GFlops])。まあ、GPUが必要なんだと思うけど。