cal

もうどうでもいいか、と思ったが、やっぱり気になるのでCALでも測っておくと、

  • カーネル起動 → やっぱ500usecぐらい
  • localをmap/unmap → 1[msec]
  • remoteをmap/unmap → 1[usec] (サイズ関係無し)

remoteを使えばいいのか…まあ、それでもカーネル起動500usecは話にならないけど。

#include <cal.h>
#include <calcl.h>
#include <stdlib.h>
#include <stdio.h>
#include <windows.h>

/* IL source for the benchmark kernel: only the "il_ps_2_0" header line and
 * "end", with no instructions in between, so launching it measures launch
 * overhead rather than work. */
const CALchar *k = "il_ps_2_0\nend\n";

/* Performance-counter frequency; filled in once by main() and passed to
 * get_diff() to turn tick deltas into seconds. */
static LARGE_INTEGER qf;

/*
 * Convert the difference between two performance-counter samples to seconds.
 *
 * a    - minuend sample (callers pass the end time)
 * b    - subtrahend sample (callers pass the start time)
 * freq - divisor; callers pass the value from QueryPerformanceFrequency
 *
 * Returns (a - b) / freq as a double.
 *
 * The tick delta is computed in 64-bit integer arithmetic and only then
 * converted to double. Converting each absolute counter value to double
 * first would round away low bits once the counter exceeds 2^53, which
 * corrupts exactly the small deltas this benchmark is trying to measure.
 */
static double
get_diff(LARGE_INTEGER a,
         LARGE_INTEGER b,
         LARGE_INTEGER freq)
{
    LONGLONG ticks = a.QuadPart - b.QuadPart;

    return ((double)ticks / (double)freq.QuadPart);
}


/*
 * Micro-benchmark of three CAL operations, each measured four times:
 *   1. launching the empty kernel `k` and spinning until its event completes
 *      (timed with QueryPerformanceCounter, printed in seconds),
 *   2. map + unmap of a local 1D resource (QueryPerformanceCounter, seconds),
 *   3. map + unmap of a remote 1D resource (__rdtsc, printed in clock ticks).
 *
 * Every CAL call that sets up or drives the measurement is checked; on
 * failure a short label naming the step is printed and the process exits
 * with status 1 (error paths rely on process exit for resource release).
 * On success all CAL objects are released and 0 is returned.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    QueryPerformanceFrequency(&qf);

    if (calInit() != CAL_RESULT_OK) {
        puts("init");
        return 1;
    }

    /* Device 0 is used below, so at least one device must be present. */
    CALuint num = 0;
    if (calDeviceGetCount(&num) != CAL_RESULT_OK || num == 0) {
        puts("device count");
        return 1;
    }

    CALdevice dev = 0;
    if (calDeviceOpen(&dev, 0) != CAL_RESULT_OK) {
        puts("device open");
        return 1;
    }

    CALcontext ctxt = 0;
    if (calCtxCreate(&ctxt, dev) != CAL_RESULT_OK) {
        puts("context");
        return 1;
    }

    /* attribs.target selects the compile target for calclCompile. */
    CALdeviceattribs attribs;
    attribs.struct_size = sizeof(CALdeviceattribs);
    if (calDeviceGetAttribs(&attribs, 0) != CAL_RESULT_OK) {
        puts("attribs");
        return 1;
    }

    CALobject obj = NULL;
    CALlanguage lang = CAL_LANGUAGE_IL;

    if (calclCompile(&obj, lang, k, attribs.target) != CAL_RESULT_OK) {
        puts("build");
        return 1;
    }

    CALimage img = 0;
    if (calclLink(&img, &obj, 1) != CAL_RESULT_OK) {
        puts("link");
        return 1;
    }

    CALmodule mod = 0;
    if (calModuleLoad(&mod, ctxt, img) != CAL_RESULT_OK) {
        puts("module load");
        return 1;
    }

    CALfunc func = 0;
    if (calModuleGetEntry(&func, ctxt, mod, "main") != CAL_RESULT_OK) {
        puts("module entry");
        return 1;
    }

    /* --- 1. kernel launch + completion ------------------------------- */
    CALdomain domain = {0,0,1,1};
    CALevent e = 0;

    for (int i=0; i<4; i++) {
        LARGE_INTEGER tb,te;
        QueryPerformanceCounter(&tb);
        CALresult r = calCtxRunProgram(&e, ctxt, func, &domain);
        if (r != CAL_RESULT_OK) {
            puts("run");
            return 1;
        }
        /* Busy-wait so the measurement is not inflated by a sleep/wake. */
        do {
            r = calCtxIsEventDone(ctxt, e);
        } while (r == CAL_RESULT_PENDING);
        QueryPerformanceCounter(&te);

        /* Anything other than OK here is an error, not a completed run;
         * do not report a time for it. */
        if (r != CAL_RESULT_OK) {
            puts("event");
            return 1;
        }

        double d = get_diff(te, tb, qf);
        printf("kernel : %f[sec]\n", d);
    }

    /* --- 2. map/unmap of a local resource ---------------------------- */
    CALresource mem_res = 0;
    if (calResAllocLocal1D(&mem_res, dev, 1, CAL_FORMAT_BYTE_1, 0) != CAL_RESULT_OK) {
        puts("alloc local");
        return 1;
    }
    for (int i=0; i<4; i++) {
        LARGE_INTEGER tb,te;
        CALvoid *p;
        CALuint pitch;

        QueryPerformanceCounter(&tb);
        CALresult r = calResMap(&p, &pitch, mem_res, 0);
        if (r != CAL_RESULT_OK) {
            puts("map");
            return 1;
        }
        r = calResUnmap(mem_res);
        if (r != CAL_RESULT_OK) {
            puts("unmap");
            return 1;
        }

        QueryPerformanceCounter(&te);
        double d = get_diff(te, tb, qf);
        printf("map/unmap(local) : %f[sec]\n", d);
    }

    /* Release the local resource before the handle is reused below;
     * otherwise it would be leaked when mem_res is overwritten. */
    if (calResFree(mem_res) != CAL_RESULT_OK) {
        puts("free local");
        return 1;
    }
    mem_res = 0;

    /* --- 3. map/unmap of a remote resource --------------------------- */
    if (calResAllocRemote1D(&mem_res, &dev, 1, 1024*16, CAL_FORMAT_BYTE_1, 0) != CAL_RESULT_OK) {
        puts("alloc remote");
        return 1;
    }
    for (int i=0; i<4; i++) {
        CALvoid *p;
        CALuint pitch;

        /* __rdtsc yields a 64-bit tick count; keep the full width so the
         * difference is not computed on truncated values. */
        unsigned long long tsc_b = __rdtsc();
        CALresult r = calResMap(&p, &pitch, mem_res, 0);
        if (r != CAL_RESULT_OK) {
            puts("map");
            return 1;
        }
        r = calResUnmap(mem_res);
        if (r != CAL_RESULT_OK) {
            puts("unmap");
            return 1;
        }
        unsigned long long tsc_e = __rdtsc();

        printf("map/unmap(remote): %llu[clk]\n", tsc_e - tsc_b);
    }

    /* --- teardown (best effort; results are not needed past this point) */
    calResFree(mem_res);
    calModuleUnload(ctxt, mod);
    calclFreeImage(img);
    calclFreeObject(obj);
    calCtxDestroy(ctxt);
    calDeviceClose(dev);
    calShutdown();

    return 0;
}


結果

kernel : 0.003309[sec]
kernel : 0.001690[sec]
kernel : 0.000451[sec]
kernel : 0.000296[sec]
map/unmap(local) : 0.003380[sec]
map/unmap(local) : 0.001555[sec]
map/unmap(local) : 0.001237[sec]
map/unmap(local) : 0.000992[sec]
map/unmap(remote): 25515[clk]
map/unmap(remote): 11472[clk]
map/unmap(remote): 9759[clk]
map/unmap(remote): 9873[clk]