ひとり ARM tech なんとか

http://www.event-info.com/arm-seminar2014/

明日だった。あとintel なんとかも明日ですね。

まあ別にこういうイベントって真面目系だから興味無いのだけど、せっかくなので Cortex A15も詰め詰めしよう。

A15 は今 37.2% で、これはA9より低い。なんで？

まあ4コア使うと1.605GHzに落ちているっぽいのだけど、それを考慮しても 41% 程度。もっと出てもいいはず。

今の最内ループは、

    __asm__ __volatile__ (".p2align 4\n\t"
                          "1:\n\t"
                          "vld1.32 {d0,d1}, [%[inL00_0]:64]!\n\t"
                          "vld1.32 {d2,d3}, [%[inL00_1]:64]!\n\t"
                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"

                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "vmla.f32 %q[vout0_0], q8, d0[0]\n\t"
                          "vmla.f32 %q[vout1_0], q8, d2[0]\n\t"
                          "vmla.f32 %q[vout0_1], q9, d0[0]\n\t"
                          "vmla.f32 %q[vout1_1], q9, d2[0]\n\t"
                          "pld [%[inL00_0], #64]\n\t"
                          "vmla.f32 %q[vout0_2], q10, d0[0]\n\t"
                          "vmla.f32 %q[vout1_2], q10, d2[0]\n\t"
                          "vmla.f32 %q[vout0_3], q11, d0[0]\n\t"
                          "vmla.f32 %q[vout1_3], q11, d2[0]\n\t"

                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"
                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "vmla.f32 %q[vout0_0], q8, d0[1]\n\t"
                          "vmla.f32 %q[vout1_0], q8, d2[1]\n\t"
                          "vmla.f32 %q[vout0_1], q9, d0[1]\n\t"
                          "vmla.f32 %q[vout1_1], q9, d2[1]\n\t"
                          "pld [%[inL00_1], #64]\n\t"
                          "vmla.f32 %q[vout0_2], q10, d0[1]\n\t"
                          "vmla.f32 %q[vout1_2], q10, d2[1]\n\t"
                          "vmla.f32 %q[vout0_3], q11, d0[1]\n\t"
                          "vmla.f32 %q[vout1_3], q11, d2[1]\n\t"

                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"
                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "vmla.f32 %q[vout0_0], q8, d1[0]\n\t"
                          "vmla.f32 %q[vout1_0], q8, d3[0]\n\t"
                          "vmla.f32 %q[vout0_1], q9, d1[0]\n\t"
                          "vmla.f32 %q[vout1_1], q9, d3[0]\n\t"
                          "vmla.f32 %q[vout0_2], q10, d1[0]\n\t"
                          "vmla.f32 %q[vout1_2], q10, d3[0]\n\t"
                          "vmla.f32 %q[vout0_3], q11, d1[0]\n\t"
                          "vmla.f32 %q[vout1_3], q11, d3[0]\n\t"

                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"
                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "vmla.f32 %q[vout0_0], q8, d1[1]\n\t"
                          "vmla.f32 %q[vout1_0], q8, d3[1]\n\t"
                          "vmla.f32 %q[vout0_1], q9, d1[1]\n\t"
                          "cmp %[inRp1], %[inRp_end]\n\t"
                          "vmla.f32 %q[vout1_1], q9, d3[1]\n\t"
                          "vmla.f32 %q[vout0_2], q10, d1[1]\n\t"
                          "vmla.f32 %q[vout1_2], q10, d3[1]\n\t"
                          "vmla.f32 %q[vout0_3], q11, d1[1]\n\t"
                          "vmla.f32 %q[vout1_3], q11, d3[1]\n\t"

                          "bne 1b\n\t"
                          :[inL00_0]"+r"(inL00_0

こうなっている。vmla 以外にもいくつか命令があるが、Cortex A15は、OoO 3issue なので、多少他の命令が混ざっていても、80-90%ぐらい出てもよさげだが。

Tegra4 は、4+1 とかいうのを実装していて、負荷によってonlineになるCPUの数が変わるらしい。

http://elinux.org/Jetson/Performance

無効にして、全コア有効にするのが良い。

が、まずは何故かクロックが下がってしまう4コアのことは忘れて、1コアだけonlineにしておこう。cpufreq で、クロックは1.8105 GHz に固定しておく。また、Android上だと複数スレッド時に値が安定しないようなので、AndroidにLinuxをインストールしてその上で動かしている。

最初の条件

1コア
1.8105GHz
理論値 14.484 GFLOPS (https://play.google.com/store/apps/details?id=jp.flatlib.flatlib3.vfpbench を動かすと 13.438 GFLOPS 出る)

とりあえずグラフ描くと

こう(オレンジが理論値)。まあ、メモリでいくらか引っかかってる気もするが、それ以上に別の理由で止まってるように見える。

とりあえず、

#if 0
    __asm__ __volatile__ (".p2align 4\n\t"
                          "1:\n\t"
                          "vld1.32 {d0,d1}, [%[inL00_0]:64]!\n\t"
                          <..snip..>
                         );

#else
    asm __volatile__ (""
                      :[inL00_0]"+r"(inL00_0), [inL00_1]"+r"(inL00_1),
                       [vout0_0]"+w"(vout0_0),
                       [vout0_1]"+w"(vout0_1),
                       [vout0_2]"+w"(vout0_2),
                       [vout0_3]"+w"(vout0_3),
                       [vout1_0]"+w"(vout1_0),
                       [vout1_1]"+w"(vout1_1),
                       [vout1_2]"+w"(vout1_2),
                       [inRp1]"+r"(inRp1)
                      :[pitch_f32]"r"(pitch_f32*4), [inRp_end]"r"(inRp1 + 128*pitch_f32), [pld_offset]"r"(pitch_f32*4*4));
#endif

こんな感じにする。こうすると、最内ループ以外で消費している時間がわかる。(inline asm の引数は、コンパイラの最適化をほどよく制御するのに便利ですね。覚えておきましょう。)

asm 中身あり
( 1024- 0):neon                : sec= 0.39874,   5.01581[GFLOPS],  0.02939[GB/s]
( 1024- 1):neon                : sec= 0.39206,   5.10120[GFLOPS],  0.02989[GB/s]
( 1024- 2):neon                : sec= 0.39550,   5.05686[GFLOPS],  0.02963[GB/s]

asm 中身無し
( 1024- 0):neon                : sec= 0.01932, 103.53541[GFLOPS],  0.60665[GB/s]
( 1024- 1):neon                : sec= 0.01427, 140.15840[GFLOPS],  0.82124[GB/s]
( 1024- 2):neon                : sec= 0.01227, 162.98274[GFLOPS],  0.95498[GB/s]

まあ5%ぐらいか。何も処理していないことを考えると、ちょっと大きい気もするが、今見るべきでは無いという気もする。

まず演算かメモリかを切り分ける。

    __asm__ __volatile__ (".p2align 4\n\t"
                          "1:\n\t"
                          "//vld1.32 {d0,d1}, [%[inL00_0]:64]!\n\t"
                          "//vld1.32 {d2,d3}, [%[inL00_1]:64]!\n\t"
                          "//pld [%[inRp1], %[pld_offset]]\n\t"
                          "//vldmia %[inRp1], {q8-q11}\n\t"

                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "vmla.f32 %q[vout0_0], q8, d0[0]\n\t"
                          "vmla.f32 %q[vout1_0], q8, d2[0]\n\t"
                          "vmla.f32 %q[vout0_1], q9, d0[0]\n\t"

こんな感じにする。わかりづらいが、メモリ関連命令を消している。

( 1024- 0):neon                : sec= 0.22819,   8.76453[GFLOPS],  0.05135[GB/s]
( 1024- 1):neon                : sec= 0.21176,   9.44475[GFLOPS],  0.05534[GB/s]
( 1024- 2):neon                : sec= 0.19515,  10.24869[GFLOPS],  0.06005[GB/s]
( 1152- 0):neon                : sec= 0.26081,  10.91833[GFLOPS],  0.05687[GB/s]
( 1152- 1):neon                : sec= 0.23850,  11.93990[GFLOPS],  0.06219[GB/s]
( 1152- 2):neon                : sec= 0.23796,  11.96672[GFLOPS],  0.06233[GB/s]

こんな感じ…んーなんか普通にメモリネックな気が…

次にロード命令かキャッシュかを切り分ける。

    __asm__ __volatile__ (".p2align 4\n\t"
                          "1:\n\t"
                          "vld1.32 {d0,d1}, [%[inL00_0]:64]\n\t"  // !
                          "vld1.32 {d2,d3}, [%[inL00_1]:64]\n\t"  // !
                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"

                          "add %[inRp1_cnt], %[inRp1_cnt], %[pitch_f32]\n\t" // !

                          "vmla.f32 %q[vout0_0], q8, d0[0]\n\t"
                          "vmla.f32 %q[vout1_0], q8, d2[0]\n\t"
                          "vmla.f32 %q[vout0_1], q9, d0[0]\n\t"
                          "vmla.f32 %q[vout1_1], q9, d2[0]\n\t"
                          "pld [%[inL00_0], #64]\n\t"
                          "vmla.f32 %q[vout0_2], q10, d0[0]\n\t"
                          "vmla.f32 %q[vout1_2], q10, d2[0]\n\t"
                          "vmla.f32 %q[vout0_3], q11, d0[0]\n\t"
                          "vmla.f32 %q[vout1_3], q11, d2[0]\n\t"

                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"
                          "add %[inRp1_cnt], %[inRp1_cnt], %[pitch_f32]\n\t"  // !

こんな感じにする。わかりづらいが、ポインタの加算をやめている。元のコードではポインタとループカウンタを兼用していたので、カウンタを別レジスタに分けた分だけレジスタ使用が一個増えてしまっているが…まあ無視しよう。(ポインタとループカウンタを共有する手法については、VisualStudioが出すコードを参考にすると良い。あれはいつ見ても良い)

( 1280- 0):neon                : sec= 0.48997,   7.97241[GFLOPS],  0.03737[GB/s]
( 1280- 1):neon                : sec= 0.45807,   8.52755[GFLOPS],  0.03997[GB/s]
( 1280- 2):neon                : sec= 0.49251,   7.93127[GFLOPS],  0.03718[GB/s]
( 1408- 0):neon                : sec= 0.77800,   6.68278[GFLOPS],  0.02848[GB/s]
( 1408- 1):neon                : sec= 0.73396,   7.08381[GFLOPS],  0.03019[GB/s]
( 1408- 2):neon                : sec= 0.73258,   7.09712[GFLOPS],  0.03024[GB/s]

あまり変化が無い。最初のグラフで見た、7GFLOPSは、キャッシュに入るとそのぐらい出そうだというように見える。

あ、pldはキャッシュに乗っててもスループットが悪いとか。消した。まあ変わらん。

ということを考えると、普通にロード命令がネックでは？という気がする。

    __asm__ __volatile__ (".p2align 4\n\t"
                          "1:\n\t"
                          "vld1.32 {d0,d1}, [%[inL00_0]:64]!\n\t"
                          "vld1.32 {d2,d3}, [%[inL00_1]:64]!\n\t"
                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"

                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "//vmla.f32 %q[vout0_0], q8, d0[0]\n\t"
                          "//vmla.f32 %q[vout1_0], q8, d2[0]\n\t"
                          "//vmla.f32 %q[vout0_1], q9, d0[0]\n\t"
                          "//vmla.f32 %q[vout1_1], q9, d2[0]\n\t"
                          "pld [%[inL00_0], #64]\n\t"
                          "//vmla.f32 %q[vout0_2], q10, d0[0]\n\t"
                          "//vmla.f32 %q[vout1_2], q10, d2[0]\n\t"
                          "//vmla.f32 %q[vout0_3], q11, d0[0]\n\t"
                          "//vmla.f32 %q[vout1_3], q11, d2[0]\n\t"

                          "pld [%[inRp1], %[pld_offset]]\n\t"
                          "vldmia %[inRp1], {q8-q11}\n\t"
                          "add %[inRp1], %[inRp1], %[pitch_f32]\n\t"

                          "//vmla.f32 %q[vout0_0], q8, d0[1]\n\t"
                          "//vmla.f32 %q[vout1_0], q8, d2[1]\n\t"
                          "//vmla.f32 %q[vout0_1], q9, d0[1]\n\t"
                          "//vmla.f32 %q[vout1_1], q9, d2[1]\n\t"

演算を消す。まあ、ちゃんとロードしたレジスタに触れてあげないと公平ではないんだが、まあいいや。

( 1024- 0):neon                : sec= 0.33853,   5.90783[GFLOPS],  0.03462[GB/s]
( 1024- 1):neon                : sec= 0.33344,   5.99811[GFLOPS],  0.03515[GB/s]
( 1024- 2):neon                : sec= 0.33037,   6.05381[GFLOPS],  0.03547[GB/s]

は？もしかしてロード命令ってスループット128bit/cycle出ないんじゃね？

ここでやっとマニュアルを見る。

マヌアルの読みかたがわからないのだった。(終わり)