cal
もうどうでもいいか、と思ったが、やっぱり気になるのでCALでも測っておくと、
- カーネル起動 → やっぱ500usecぐらい
- localをmap/unmap → 1[msec]
- remoteをmap/unmap → 1[usec] (サイズ関係無し)
remoteを使えばいいのか…まあ、それでもカーネル起動500usecは話にならないけど。
#include <cal.h>
#include <calcl.h>
#include <stdlib.h>
#include <stdio.h>
#include <windows.h>
#include <intrin.h> /* __rdtsc */

enum {
    NUM_TRIALS = 4,            /* repetitions of every measurement */
    REMOTE_WIDTH = 1024 * 16   /* element count of the remote 1D resource */
};

/* IL source handed to calclCompile: only the version header and `end`. */
const CALchar *k =
    "il_ps_2_0\n"
    "end\n";

/* Performance-counter frequency, filled in once at the start of main. */
static LARGE_INTEGER qf;

/*
 * Returns the interval (a - b) in seconds, where a and b are
 * QueryPerformanceCounter readings and freq is the counter frequency.
 * The subtraction is done on the 64-bit integers so that no precision is
 * lost before the conversion to double.
 */
static double get_diff(LARGE_INTEGER a, LARGE_INTEGER b, LARGE_INTEGER freq)
{
    return (double)(a.QuadPart - b.QuadPart) / (double)freq.QuadPart;
}

/*
 * Runs `func` NUM_TRIALS times over a 1x1 domain and prints the wall-clock
 * time from submission until calCtxIsEventDone stops reporting PENDING.
 * Returns 0 on success, 1 if a launch fails.
 */
static int time_kernel_launch(CALcontext ctxt, CALfunc func)
{
    CALdomain domain = {0, 0, 1, 1};
    CALevent e = 0;

    for (int i = 0; i < NUM_TRIALS; i++) {
        LARGE_INTEGER tb, te;

        QueryPerformanceCounter(&tb);
        if (calCtxRunProgram(&e, ctxt, func, &domain) != CAL_RESULT_OK) {
            puts("run");
            return 1;
        }
        while (calCtxIsEventDone(ctxt, e) == CAL_RESULT_PENDING) {
            /* busy-wait: sleeping would distort the measurement */
        }
        QueryPerformanceCounter(&te);

        printf("kernel : %f[sec]\n", get_diff(te, tb, qf));
    }
    return 0;
}

/*
 * Maps and immediately unmaps `res` NUM_TRIALS times, printing the elapsed
 * time of each pair in seconds (QueryPerformanceCounter).
 * Returns 0 on success, 1 if a map or unmap fails.
 */
static int time_map_unmap_local(CALresource res)
{
    for (int i = 0; i < NUM_TRIALS; i++) {
        LARGE_INTEGER tb, te;
        CALvoid *p;
        CALuint pitch;

        QueryPerformanceCounter(&tb);
        if (calResMap(&p, &pitch, res, 0) != CAL_RESULT_OK) {
            puts("map");
            return 1;
        }
        if (calResUnmap(res) != CAL_RESULT_OK) {
            puts("unmap");
            return 1;
        }
        QueryPerformanceCounter(&te);

        printf("map/unmap(local) : %f[sec]\n", get_diff(te, tb, qf));
    }
    return 0;
}

/*
 * Maps and immediately unmaps `res` NUM_TRIALS times, printing the elapsed
 * time of each pair in TSC ticks.
 * Returns 0 on success, 1 if a map or unmap fails.
 */
static int time_map_unmap_remote(CALresource res)
{
    for (int i = 0; i < NUM_TRIALS; i++) {
        CALvoid *p;
        CALuint pitch;
        /* __rdtsc yields a 64-bit counter; storing it in int truncates it. */
        unsigned __int64 tsc_b, tsc_e;

        tsc_b = __rdtsc();
        if (calResMap(&p, &pitch, res, 0) != CAL_RESULT_OK) {
            puts("map");
            return 1;
        }
        if (calResUnmap(res) != CAL_RESULT_OK) {
            puts("unmap");
            return 1;
        }
        tsc_e = __rdtsc();

        printf("map/unmap(remote): %llu[clk]\n",
               (unsigned long long)(tsc_e - tsc_b));
    }
    return 0;
}

/*
 * Benchmark driver: opens device 0, builds the kernel in `k`, then measures
 * kernel launch latency and map/unmap latency for a resource allocated with
 * calResAllocLocal1D and one allocated with calResAllocRemote1D.
 * Returns 0 when every step succeeds, 1 otherwise. Everything that was
 * acquired is released again on every exit path.
 */
int main(int argc, char **argv)
{
    /* All handles live at function scope so the cleanup gotos never jump
     * across an initialisation (keeps the file valid as C and as C++). */
    int rc = 1;
    int failed;
    CALuint num = 0;
    CALdevice dev = 0;
    CALcontext ctxt = 0;
    CALdeviceattribs attribs;
    CALobject obj = NULL;
    CALimage img = NULL;
    CALlanguage lang = CAL_LANGUAGE_IL;
    CALmodule mod = 0;
    CALfunc func = 0;
    CALresource mem_res = 0;

    (void)argc;
    (void)argv;

    QueryPerformanceFrequency(&qf);

    if (calInit() != CAL_RESULT_OK) {
        puts("init");
        return 1;
    }
    if (calDeviceGetCount(&num) != CAL_RESULT_OK || num == 0) {
        puts("no device");
        goto out_shutdown;
    }
    if (calDeviceOpen(&dev, 0) != CAL_RESULT_OK) {
        puts("open");
        goto out_shutdown;
    }
    if (calCtxCreate(&ctxt, dev) != CAL_RESULT_OK) {
        puts("context");
        goto out_device;
    }

    attribs.struct_size = sizeof(CALdeviceattribs);
    if (calDeviceGetAttribs(&attribs, 0) != CAL_RESULT_OK) {
        puts("attribs");
        goto out_context;
    }

    if (calclCompile(&obj, lang, k, attribs.target) != CAL_RESULT_OK) {
        puts("build");
        goto out_context;
    }
    if (calclLink(&img, &obj, 1) != CAL_RESULT_OK) {
        puts("link");
        goto out_object;
    }
    if (calModuleLoad(&mod, ctxt, img) != CAL_RESULT_OK) {
        puts("load");
        goto out_image;
    }
    if (calModuleGetEntry(&func, ctxt, mod, "main") != CAL_RESULT_OK) {
        puts("entry");
        goto out_module;
    }

    if (time_kernel_launch(ctxt, func) != 0) {
        goto out_module;
    }

    if (calResAllocLocal1D(&mem_res, dev, 1, CAL_FORMAT_BYTE_1, 0) != CAL_RESULT_OK) {
        puts("alloc(local)");
        goto out_module;
    }
    failed = time_map_unmap_local(mem_res);
    /* Release before the handle is reused for the remote resource. */
    calResFree(mem_res);
    if (failed) {
        goto out_module;
    }

    if (calResAllocRemote1D(&mem_res, &dev, 1, REMOTE_WIDTH, CAL_FORMAT_BYTE_1, 0) != CAL_RESULT_OK) {
        puts("alloc(remote)");
        goto out_module;
    }
    failed = time_map_unmap_remote(mem_res);
    calResFree(mem_res);
    if (failed) {
        goto out_module;
    }

    rc = 0;

out_module:
    calModuleUnload(ctxt, mod);
out_image:
    calclFreeImage(img);
out_object:
    calclFreeObject(obj);
out_context:
    calCtxDestroy(ctxt);
out_device:
    calDeviceClose(dev);
out_shutdown:
    calShutdown();
    return rc;
}
結果
kernel : 0.003309[sec]
kernel : 0.001690[sec]
kernel : 0.000451[sec]
kernel : 0.000296[sec]
map/unmap(local) : 0.003380[sec]
map/unmap(local) : 0.001555[sec]
map/unmap(local) : 0.001237[sec]
map/unmap(local) : 0.000992[sec]
map/unmap(remote): 25515[clk]
map/unmap(remote): 11472[clk]
map/unmap(remote): 9759[clk]
map/unmap(remote): 9873[clk]