haswell - w_o’s diary

とりあえず手元にツールある分くらいはのせとく

https://github.com/tanakamura/instruction-bench

http://int.main.jp/prog/bin/bench.exe

ターボブーストの止めかたがわからなかったので、信頼性は若干よくない。まあaddのレイテンシが1.00になってるし、多分あってる。

アンロール8回程度ではmulx2のレイテンシを埋められない問題が発覚してるのでIPC変な値になってるやつはソース確認してください。

== latency/throughput ==
   reg64:       add:   latency: CPI=    1.00, IPC=    1.00
   reg64:       add:throughput: CPI=    0.29, IPC=    3.49
   reg64:      load:   latency: CPI=    5.00, IPC=    0.20
   reg64:      load:throughput: CPI=    0.63, IPC=    1.60
    m128:      pxor:   latency: CPI=    0.28, IPC=    3.55
    m128:      pxor:throughput: CPI=    0.28, IPC=    3.55
    m128:      padd:   latency: CPI=    1.01, IPC=    0.99
    m128:      padd:throughput: CPI=    0.50, IPC=    2.00
    m128:    pmuldq:   latency: CPI=    5.00, IPC=    0.20
    m128:    pmuldq:throughput: CPI=    1.00, IPC=    1.00
    m128:    loadps:   latency: CPI=    7.01, IPC=    0.14
    m128:    loadps:throughput: CPI=    0.50, IPC=    2.00
    m128:     xorps:   latency: CPI=    0.28, IPC=    3.55
    m128:     xorps:throughput: CPI=    0.28, IPC=    3.56
    m128:     addps:   latency: CPI=    3.00, IPC=    0.33
    m128:     addps:throughput: CPI=    1.00, IPC=    1.00
    m128:     mulps:   latency: CPI=    5.03, IPC=    0.20
    m128:     mulps:throughput: CPI=    0.63, IPC=    1.59
    m128:   blendps:   latency: CPI=    1.00, IPC=    1.00
    m128:   blendps:throughput: CPI=    0.33, IPC=    3.00
    m128:    pshufb:   latency: CPI=    1.00, IPC=    1.00
    m128:    pshufb:throughput: CPI=    1.00, IPC=    1.00
    m128:    pmullw:   latency: CPI=    5.02, IPC=    0.20
    m128:    pmullw:throughput: CPI=    1.00, IPC=    1.00
    m128:    phaddd:   latency: CPI=    3.00, IPC=    0.33
    m128:    phaddd:throughput: CPI=    2.00, IPC=    0.50
    m128:    pinsrd:   latency: CPI=    2.06, IPC=    0.49
    m128:    pinsrd:throughput: CPI=    2.00, IPC=    0.50
    m128:      dpps:   latency: CPI=   14.03, IPC=    0.07
    m128:      dpps:throughput: CPI=    2.00, IPC=    0.50
    m128:  cvtps2dq:   latency: CPI=    3.00, IPC=    0.33
    m128:  cvtps2dq:throughput: CPI=    1.00, IPC=    1.00
    m256:    loadps:   latency: CPI=    1.00, IPC=    1.00
    m256:    loadps:throughput: CPI=    0.50, IPC=    2.00
    m256:     xorps:   latency: CPI=    0.28, IPC=    3.54
    m256:     xorps:throughput: CPI=    0.28, IPC=    3.56
    m256:     mulps:   latency: CPI=    5.03, IPC=    0.20
    m256:     mulps:throughput: CPI=    0.63, IPC=    1.58
    m256:     addps:   latency: CPI=    3.01, IPC=    0.33
    m256:     addps:throughput: CPI=    1.00, IPC=    1.00
    m256:     divps:   latency: CPI=   18.11, IPC=    0.06
    m256:     divps:throughput: CPI=   14.13, IPC=    0.07
    m256:     divpd:   latency: CPI=   19.11, IPC=    0.05
    m256:     divpd:throughput: CPI=   16.14, IPC=    0.06
    m256:   rsqrtps:   latency: CPI=    7.00, IPC=    0.14
    m256:   rsqrtps:throughput: CPI=    2.00, IPC=    0.50
    m256:     rcpps:   latency: CPI=    7.01, IPC=    0.14
    m256:     rcpps:throughput: CPI=    2.00, IPC=    0.50
    m256:    sqrtps:   latency: CPI=   18.13, IPC=    0.06
    m256:    sqrtps:throughput: CPI=   14.22, IPC=    0.07
    m256:vperm2f128:   latency: CPI=    3.00, IPC=    0.33
    m256:vperm2f128:throughput: CPI=    1.00, IPC=    1.00

https://github.com/tanakamura/clminibench

http://int.main.jp/prog/bin/clinstbench.exe

使いかた全く説明してないが、bench.c の文字コードをSJISにして(Androidから使う場合はutf-8にして)jni/w32以下のCMakeLists.txtをビルドすればよいです。

「おそい」「はやい」とかのコメントは全く意味ない（人間ごときがコンピュータの性能を速い遅いとか評価していいのかという精神）ので無視してよいです。

TSX

http://ark.intel.com/products/75117/Intel-Core-i7-4700MQ-Processor-6M-Cache-up-to-3_40-GHz

ごめんTSX付いてないやつだった。誰かにまかせる。

vgather

#include <immintrin.h>
#include <x86intrin.h>
#include <stdio.h>
#include <string.h>

// 64-byte-aligned table of 4096 unsigned ints that the benchmarks read from.
// It has no initializer, so as a namespace-scope object it starts zero-filled.
__attribute__((aligned(64))) unsigned int zero_mem[4096];

// Mask that bench_gather() ANDs with every gathered vector. It is not assigned
// anywhere in the visible code, so it keeps its zero initial value.
__m256i data_zero;
// Volatile sink: bench_gather() stores its final index vector here after the loop.
volatile __m256i result;

/*
 * Times a chain of AVX2 gathers (_mm256_i32gather_epi32) reading from zero_mem.
 *
 * scale  : (template argument) byte multiplier the gather applies to each index.
 * i0..i7 : gather indices; lane k reads 4 bytes at byte offset ik*scale.
 * nloop  : number of gathers to issue; elapsed TSC ticks are divided by it.
 *
 * Prints the eight byte offsets followed by the TSC ticks per iteration.
 */
template <int scale>
__attribute__((noinline, noclone))
static void
bench_gather(int i0, int i1, int i2, int i3,
             int i4, int i5, int i6, int i7,
             int nloop)
{
    // setr (not set): lane k holds ik, which is the lane order bench_ins()
    // assembles and the order the label below is printed in.
    __m256i pattern = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
    __m256i zero = data_zero;
    // __rdtsc() yields an unsigned 64-bit count; `__int64` is only available
    // on MSVC/MinGW, so use the standard type.
    unsigned long long b = __rdtsc();

    for (int i=0; i<nloop; i++) {
        __m256i data = _mm256_i32gather_epi32((const int *)zero_mem, pattern, scale);

        // Mask the loaded data with data_zero and add it back into the indices:
        // the next gather's addresses now depend on this gather's result.
        data = _mm256_and_si256(zero, data);
        pattern = _mm256_add_epi32(pattern, data);
    }

    // Publish the final indices through the volatile sink before the end stamp.
    result = pattern;

    unsigned long long e = __rdtsc();

    printf("gather: [%2d,%2d,%2d,%2d,%2d,%2d,%2d,%2d]: %f\n",
           i0*scale, i1*scale, i2*scale, i3*scale, i4*scale, i5*scale, i6*scale, i7*scale,
           (e-b)/(double)nloop);
}

/*
 * Reads one 32-bit value located `byte_offset` bytes past the start of
 * zero_mem. memcpy is used because the offset need not be 4-byte aligned
 * (e.g. scale 1 with indices 0..7).
 */
static inline int
load_i32_at(int byte_offset)
{
    int v;
    memcpy(&v, (const char *)zero_mem + byte_offset, sizeof(v));
    return v;
}

/*
 * Times a hand-written gather: eight scalar loads from zero_mem that are
 * assembled into one __m256i with movd/pinsrd/vinserti128.
 *
 * i0..i7 : indices; element k is loaded from byte offset ik*scale, i.e. the
 *          same address bench_gather<scale>() reads for that index and the
 *          offset shown in the printed label.
 * nloop  : number of vectors to build; elapsed TSC ticks are divided by it.
 * scale  : byte multiplier applied to every index.
 *
 * Prints the eight byte offsets followed by the TSC ticks per iteration.
 */
void
__attribute__((noinline, noclone))
bench_ins(int i0, int i1, int i2, int i3,
          int i4, int i5, int i6, int i7,
          int nloop, int scale)
{
    // __rdtsc() yields an unsigned 64-bit count; `__int64` is only available
    // on MSVC/MinGW, so use the standard type.
    unsigned long long b = __rdtsc();

    for (int i=0; i<nloop; i++) {
        // scale is a byte scale here, exactly as in the gather instruction;
        // indexing zero_mem[ik*scale] would read 4x further away.
        int v0 = load_i32_at(i0*scale);
        int v1 = load_i32_at(i1*scale);
        int v2 = load_i32_at(i2*scale);
        int v3 = load_i32_at(i3*scale);
        int v4 = load_i32_at(i4*scale);
        int v5 = load_i32_at(i5*scale);
        int v6 = load_i32_at(i6*scale);
        int v7 = load_i32_at(i7*scale);

        // Low 128 bits: elements 0..3.
        __m128i lo = _mm_cvtsi32_si128(v0);
        lo = _mm_insert_epi32(lo, v1, 1);
        lo = _mm_insert_epi32(lo, v2, 2);
        lo = _mm_insert_epi32(lo, v3, 3);
        // High 128 bits: elements 4..7.
        __m128i hi = _mm_cvtsi32_si128(v4);
        hi = _mm_insert_epi32(hi, v5, 1);
        hi = _mm_insert_epi32(hi, v6, 2);
        hi = _mm_insert_epi32(hi, v7, 3);

        __m256i v = _mm256_castsi128_si256(lo);
        v = _mm256_inserti128_si256(v, hi, 1);

        // Write the assembled vector over the first 32 bytes of zero_mem.
        // Presumably this store is what stops the loads above from being
        // hoisted out of the loop, since they read the same array.
        ((__m256i*)zero_mem)[0] = v;
    }

    unsigned long long e = __rdtsc();

    printf("insert: [%2d,%2d,%2d,%2d,%2d,%2d,%2d,%2d]: %f\n",
           i0*scale, i1*scale, i2*scale, i3*scale, i4*scale, i5*scale, i6*scale, i7*scale,
           (e-b)/(double)nloop);
}

/*
 * Runs both measurements for one index pattern: first the gather-instruction
 * version, then the scalar-load/insert version. `scale` is a compile-time
 * constant for the former and is forwarded as a run-time argument to the
 * latter.
 */
template <int scale>
static void
bench(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int nloop)
{
    bench_gather<scale>(i0, i1, i2, i3,
                        i4, i5, i6, i7,
                        nloop);

    bench_ins(i0, i1, i2, i3,
              i4, i5, i6, i7,
              nloop, scale);
}


/*
 * Benchmark driver: runs the gather/insert pair over six index patterns,
 * each for the same number of iterations.
 */
int main()
{
    const int nloop = 2048 * 16;

    // Consecutive indices 0..7 with scale 4, 8 and 1.
    bench<4>(0, 1, 2, 3, 4, 5, 6, 7, nloop);
    bench<8>(0, 1, 2, 3, 4, 5, 6, 7, nloop);
    bench<1>(0, 1, 2, 3, 4, 5, 6, 7, nloop);

    // Index stride 4 with scale 4.
    bench<4>(0, 4, 8, 12, 16, 20, 24, 28, nloop);

    // Index stride 32 with scale 1.
    bench<1>(0, 32, 64, 96, 128, 160, 192, 224, nloop);

    // Index stride 128 with scale 1.
    bench<1>(0, 128, 256, 384, 512, 640, 768, 896, nloop);

    return 0;
}

やったー、vgather命令をvinsrdで実装しなおしたらはやくなったよー。

gather: [ 0, 4, 8,12,16,20,24,28]: 68.279663
insert: [ 0, 4, 8,12,16,20,24,28]: 39.467285
gather: [ 0, 8,16,24,32,40,48,56]: 68.245972
insert: [ 0, 8,16,24,32,40,48,56]: 36.012817
gather: [ 0, 1, 2, 3, 4, 5, 6, 7]: 68.246704
insert: [ 0, 1, 2, 3, 4, 5, 6, 7]: 48.155273
gather: [ 0,16,32,48,64,80,96,112]: 68.210449
insert: [ 0,16,32,48,64,80,96,112]: 36.268433
gather: [ 0,32,64,96,128,160,192,224]: 68.238281
insert: [ 0,32,64,96,128,160,192,224]: 36.318604
gather: [ 0,128,256,384,512,640,768,896]: 68.216675
insert: [ 0,128,256,384,512,640,768,896]: 36.013184

avx2/fma

とりあえず生ログだが

    m256:      pxor:   latency: CPI=    0.28, IPC=    3.56
    m256:      pxor:throughput: CPI=    0.28, IPC=    3.56
    m256:     paddd:   latency: CPI=    1.00, IPC=    1.00
    m256:     paddd:throughput: CPI=    0.50, IPC=    2.00
    m256:   vpermps:   latency: CPI=    3.00, IPC=    0.33
    m256:   vpermps:throughput: CPI=    1.00, IPC=    1.00
    m256:   vpermpd:   latency: CPI=    3.00, IPC=    0.33
    m256:   vpermpd:throughput: CPI=    1.00, IPC=    1.00
    m256:    vfmaps:   latency: CPI=    5.00, IPC=    0.20
    m256:    vfmaps:throughput: CPI=    0.50, IPC=    2.00
    m256:    vfmapd:   latency: CPI=    5.00, IPC=    0.20
    m256:    vfmapd:throughput: CPI=    0.50, IPC=    2.00

上のIPCのやつ適当にグラフも描いといたから各自見といて。
https://skydrive.live.com/view.aspx?resid=ECB59E566C2D71F1!1173&cid=ecb59e566c2d71f1&app=Excel&authkey=!AI3dvNyGgQ4hkWs

追記: スループット/レイテンシは公式情報も出てます

http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-optimization-manual.html

一致していない値もあるけど、多分原因を調べたりはしないので、そこから先は必要な人が各自調べてください。