はじめてのCUDAプログラミング

思ったより呼び出しコスト少ないな…

#include <stdio.h>
#include <cutil.h>
#include <sys/time.h>

#define NUM_BLOCKS 1
#define NUM_THREADS 1

/*
 * Trivial kernel: stores 240 + i into the first element of x.
 *
 * x : pointer the kernel writes through (only element 0 is touched)
 * y : accepted but not used by the kernel body
 * i : value added to 240 before storing
 *
 * No thread or block index is consulted, so every thread that runs this
 * kernel writes the same value to the same location.
 */
__global__ static void nanika(int *x, int y, int i)
{
	const int value = i + 240;
	*x = value;
}

/*
 * Returns the current time reported by gettimeofday() as a single
 * microsecond count: seconds * 1000000 + microseconds.
 *
 * The arithmetic is carried out in unsigned long so that, on platforms
 * where long is 32 bits, the multiplication wraps around instead of
 * overflowing a signed type (undefined behaviour). Because the value may
 * wrap, callers should only rely on the difference between two readings.
 */
unsigned long getu()
{
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (unsigned long)tv.tv_sec * 1000000UL + (unsigned long)tv.tv_usec;
}

/*
 * Benchmark: average host-side time, in microseconds, to issue one launch
 * of the nanika kernel, taken over N back-to-back launches.
 *
 * Prints "<a[0]> <usec per launch>", where a[0] is the value read back
 * from the device after the last launch.
 *
 * Note: a kernel launch returns to the host before the kernel has
 * finished, so the timed interval covers issuing the launches, not their
 * execution. The blocking device-to-host copy after the loop is outside
 * the timed interval.
 */
int main(int argc, char **argv)
{
	int a[NUM_THREADS];        /* host staging buffer */
	unsigned long begin, end;  /* same type as getu(); int would truncate the timestamp */
	const int N = 10000;       /* number of timed launches */
	int i;

	int *arr = NULL;           /* device buffer, same size as a */

	CUT_DEVICE_INIT(argc, argv);

	/* Host and device buffers share one size so no copy runs past a[]. */
	CUDA_SAFE_CALL(cudaMalloc((void**)&arr, sizeof(a)));
	for (i = 0; i < NUM_THREADS; i++) {
		a[i] = 0;
	}
	CUDA_SAFE_CALL(cudaMemcpy(arr, a, sizeof(a), cudaMemcpyHostToDevice));

	/* One untimed launch before the measurement starts. */
	nanika<<<NUM_BLOCKS, NUM_THREADS>>>(arr, 1, 0);
	CUDA_SAFE_CALL(cudaGetLastError());

	/* Re-upload a[0] so the timed loop starts from a known device value. */
	CUDA_SAFE_CALL(cudaMemcpy(arr, a, sizeof(int), cudaMemcpyHostToDevice));

	begin = getu();
	for (i = 0; i < N; i++) {
		nanika<<<NUM_BLOCKS, NUM_THREADS>>>(arr, 1, i);
	}
	end = getu();

	/* Launch errors are not returned by <<<>>>; query them after timing. */
	CUDA_SAFE_CALL(cudaGetLastError());

	/* Fetch the result written by the last launch. */
	CUDA_SAFE_CALL(cudaMemcpy(a, arr, sizeof(a), cudaMemcpyDeviceToHost));

	printf("%d %lf\n", a[0], (end-begin)/(double)N);

	CUDA_SAFE_CALL(cudaFree(arr));
	return 0;
}

一回あたり3usecぐらい。もっとかかると思っていた。これは考えを修正しないといけない。

ちなみに

	/*
	 * Variant of the timed loop: each of the N iterations uploads one int,
	 * launches nanika, and downloads one int again, so every iteration
	 * includes two host<->device copies in addition to the launch.
	 */
	for (i=0; i<N; i++) {
		/* host -> device: send a[0] to arr[0] */
		CUDA_SAFE_CALL(cudaMemcpy(arr, a, sizeof(int), cudaMemcpyHostToDevice));
		/* kernel overwrites arr[0] with 240 + i */
		nanika<<<NUM_BLOCKS, NUM_THREADS>>>(arr, 1, i);
		/* device -> host: this copy has to wait for the kernel above to finish */
		CUDA_SAFE_CALL(cudaMemcpy(a, arr, sizeof(int), cudaMemcpyDeviceToHost));
	}

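こうすると30usecくらい。4byteのメモリ転送のほうが、ホスト・デバイス間の関数呼び出し(カーネル起動)よりもはるかにレイテンシが大きいのは一見謎だが、カーネル起動は非同期で、ホスト側はカーネルの完了を待たずに戻ってくる。つまり最初の計測は起動を発行するコストだけを測っていて、こちらのループは cudaMemcpy がカーネルの完了を待つぶん、往復全体の時間になっていると考えられる。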

Core2(E6600)だと

#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>

#define NUM_BLOCKS 1
#define NUM_THREADS 1

/*
 * Reads the clock with gettimeofday() and returns it as microseconds
 * (tv_sec * 1000000 + tv_usec).
 *
 * Both fields are converted to unsigned long before the arithmetic so
 * that the computation wraps rather than overflowing a signed long on
 * platforms where long is 32 bits. Only the difference between two
 * readings is meaningful once the value has wrapped.
 */
unsigned long getu()
{
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (unsigned long)tv.tv_sec * 1000000UL + (unsigned long)tv.tv_usec;
}

/*
 * CPU reference benchmark: average time, in microseconds, of one loop
 * iteration that copies 100 KB from b to a and then 100 KB from a to b.
 *
 * Prints "<a[0]> <usec per iteration>". Returns 1 if either buffer cannot
 * be allocated, 0 otherwise.
 */
int main(int argc, char **argv)
{
	const size_t buf_bytes = sizeof(int)*1024*1024;  /* size of each buffer */
	const size_t copy_bytes = 1024*100;              /* bytes moved per memcpy */
	int N = 10000;                                   /* timed iterations */
	unsigned long begin, end;  /* same type as getu(); int would truncate the timestamp */
	int i;

	int *a;
	int *b;

	a = malloc(buf_bytes);
	b = malloc(buf_bytes);
	if (a == NULL || b == NULL) {
		fprintf(stderr, "malloc failed\n");
		free(a);
		free(b);
		return 1;
	}

	/* Give both buffers defined contents before they are copied and printed. */
	memset(a, 0, buf_bytes);
	memset(b, 0, buf_bytes);

	/* Busy-wait for one second before measuring (presumably a warm-up). */
	begin = getu();
	while (getu() - begin < 1000000UL)
		;

	begin = getu();
	for (i=0; i<N; i++) {
		memcpy(a, b, copy_bytes);
		memcpy(b, a, copy_bytes);
	}
	end = getu();

	printf("%d %lf\n", a[0], (end-begin)/(double)N);

	free(a);
	free(b);
	return 0;
}

これが1ループ(100KBのmemcpyを2回)あたり30usecぐらい。キャッシュに載っていたとしても、100KB(4byteの要素で160x160ぐらい)のデータを操作するようになったら、CUDAを使えば速くなるかもしれない…というところか。