main.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. /* This file is part of the Vc project
  2. Copyright (C) 2009-2015 Matthias Kretz <kretz@kde.org>
  3. Permission to use, copy, modify, and distribute this software
  4. and its documentation for any purpose and without fee is hereby
  5. granted, provided that the above copyright notice appear in all
  6. copies and that both that the copyright notice and this
  7. permission notice and warranty disclaimer appear in supporting
  8. documentation, and that the name of the author not be used in
  9. advertising or publicity pertaining to distribution of the
  10. software without specific, written prior permission.
  11. The author disclaim all warranties with regard to this
  12. software, including all implied warranties of merchantability
  13. and fitness. In no event shall the author be liable for any
  14. special, indirect or consequential damages or any damages
  15. whatsoever resulting from loss of use, data or profits, whether
  16. in an action of contract, negligence or other tortious action,
  17. arising out of or in connection with the use or performance of
  18. this software.
  19. */
  20. #include <Vc/Vc>
  21. #include <Vc/IO>
  22. #include <iostream>
  23. #include <iomanip>
  24. #include <valarray>
  25. #include "../tsc.h"
  26. static constexpr int UnrollOuterloop = 4;
  27. template <typename T, size_t N> class MatrixValarray
  28. {
  29. using V = std::valarray<T>;
  30. V data[N];
  31. public:
  32. MatrixValarray()
  33. {
  34. for (auto &v : data) {
  35. v.resize(N);
  36. }
  37. }
  38. std::valarray<T> &operator[](std::size_t i) { return data[i]; }
  39. const std::valarray<T> &operator[](std::size_t i) const { return data[i]; }
  40. };
  41. template <typename T, size_t N>
  42. inline MatrixValarray<T, N> operator*(const MatrixValarray<T, N> &a,
  43. const MatrixValarray<T, N> &b)
  44. {
  45. MatrixValarray<T, N> c;
  46. for (size_t i = 0; i < N; ++i) {
  47. c[i] = a[i][0] * b[0];
  48. for (size_t k = 1; k < N; ++k) {
  49. c[i] += a[i][k] * b[k];
  50. }
  51. }
  52. return c;
  53. }
  54. template <typename T, size_t N> class Matrix
  55. {
  56. using V = Vc::Vector<T>;
  57. // round up to the next multiple of V::size()
  58. static constexpr size_t NPadded = (N + V::size() - 1) / V::size() * V::size();
  59. // the inner array stores one row of values and is padded
  60. using RowArray = std::array<T, NPadded>;
  61. // The following function is a workaround for GCC 4.8, which fails to recognize
  62. // V::MemoryAlignment as a constant expression inside the alignas operator.
  63. static constexpr size_t dataAlignment() { return V::MemoryAlignment; }
  64. // The outer array stores N rows and does not require further padding. It must be
  65. // aligned correctly for Vc::Aligned loads and stores, though.
  66. alignas(dataAlignment()) std::array<RowArray, N> data;
  67. public:
  68. Matrix()
  69. {
  70. for (int i = 0; i < int(N); ++i) {
  71. for (int j = N; j < int(NPadded); ++j) {
  72. data[i][j] = 0;
  73. }
  74. }
  75. }
  76. // returns a reference to the i-th row
  77. RowArray &operator[](size_t i) { return data[i]; }
  78. // const overload of the above
  79. const RowArray &operator[](size_t i) const { return data[i]; }
  80. };
  81. // vectorized matrix multiplication
  82. template <typename T, size_t N>
  83. inline Matrix<T, N> operator*(const Matrix<T, N> &a, const Matrix<T, N> &b)
  84. {
  85. constexpr int NN = N;
  86. using V = Vc::Vector<T>;
  87. // resulting matrix c
  88. Matrix<T, N> c;
  89. // The row index (for a and c) is unrolled using the UnrollOuterloop stride. Therefore
  90. // the last rows may need special treatment if N is not a multiple of UnrollOuterloop.
  91. // N0 is the number of rows that can safely be iterated with a stride of
  92. // UnrollOuterloop.
  93. constexpr int N0 = N / UnrollOuterloop * UnrollOuterloop;
  94. for (int i = 0; i < N0; i += UnrollOuterloop) {
  95. // The iteration over the column index of b and c uses a stride of V::size(). This
  96. // enables row-vector loads (from b) and stores (to c). The matrix storage is
  97. // padded accordingly, ensuring correct bounds and alignment.
  98. for (int j = 0; j < NN; j += int(V::size())) {
  99. // This temporary variables are used to accumulate the results of the products
  100. // producing the new values for the c matrix. This variable is necessary
  101. // because we need a V object for data-parallel accumulation. Storing to c
  102. // directly stores to scalar objects and thus would drop the ability for
  103. // data-parallel (SIMD) addition.
  104. V c_ij[UnrollOuterloop];
  105. for (int n = 0; n < UnrollOuterloop; ++n) {
  106. c_ij[n] = a[i + n][0] * V(&b[0][j], Vc::Aligned);
  107. }
  108. for (int k = 1; k < NN - 1; ++k) {
  109. for (int n = 0; n < UnrollOuterloop; ++n) {
  110. c_ij[n] += a[i + n][k] * V(&b[k][j], Vc::Aligned);
  111. }
  112. }
  113. for (int n = 0; n < UnrollOuterloop; ++n) {
  114. c_ij[n] += a[i + n][NN - 1] * V(&b[NN - 1][j], Vc::Aligned);
  115. c_ij[n].store(&c[i + n][j], Vc::Aligned);
  116. }
  117. }
  118. }
  119. // This final loop treats the remaining NN - N0 rows.
  120. for (int j = 0; j < NN; j += int(V::size())) {
  121. V c_ij[UnrollOuterloop];
  122. for (int n = N0; n < NN; ++n) {
  123. c_ij[n - N0] = a[n][0] * V(&b[0][j], Vc::Aligned);
  124. }
  125. for (int k = 1; k < NN - 1; ++k) {
  126. for (int n = N0; n < NN; ++n) {
  127. c_ij[n - N0] += a[n][k] * V(&b[k][j], Vc::Aligned);
  128. }
  129. }
  130. for (int n = N0; n < NN; ++n) {
  131. c_ij[n - N0] += a[n][NN - 1] * V(&b[N - 1][j], Vc::Aligned);
  132. c_ij[n - N0].store(&c[n][j], Vc::Aligned);
  133. }
  134. }
  135. return c;
  136. }
  137. // scalar matrix multiplication
  138. template <typename T, size_t N>
  139. Matrix<T, N> scalar_mul(const Matrix<T, N> &a, const Matrix<T, N> &b)
  140. {
  141. constexpr int NN = N;
  142. Matrix<T, N> c;
  143. for (int i = 0; i < NN; ++i) {
  144. for (int j = 0; j < NN; ++j) {
  145. c[i][j] = a[i][0] * b[0][j];
  146. for (int k = 1; k < NN; ++k) {
  147. c[i][j] += a[i][k] * b[k][j];
  148. }
  149. }
  150. }
  151. return c;
  152. }
  153. // scalar matrix multiplication
  154. template <typename T, size_t N>
  155. Matrix<T, N> scalar_mul_blocked(const Matrix<T, N> &a, const Matrix<T, N> &b)
  156. {
  157. constexpr int NN = N;
  158. Matrix<T, N> c;
  159. constexpr int N0 = N / UnrollOuterloop * UnrollOuterloop;
  160. for (int i = 0; i < N0; i += UnrollOuterloop) {
  161. for (int j = 0; j < NN; ++j) {
  162. for (int n = 0; n < UnrollOuterloop; ++n) {
  163. c[i + n][j] = a[i + n][0] * b[0][j];
  164. }
  165. for (int k = 1; k < NN; ++k) {
  166. for (int n = 0; n < UnrollOuterloop; ++n) {
  167. c[i + n][j] += a[i + n][k] * b[k][j];
  168. }
  169. }
  170. }
  171. }
  172. for (int j = 0; j < NN; ++j) {
  173. for (int n = N0; n < NN; ++n) {
  174. c[n][j] = a[n][0] * b[0][j];
  175. }
  176. for (int k = 1; k < NN; ++k) {
  177. for (int n = N0; n < NN; ++n) {
  178. c[n][j] += a[n][k] * b[k][j];
  179. }
  180. }
  181. }
  182. return c;
  183. }
  184. // valarray matrix multiplication
  185. template <typename T, size_t N>
  186. Matrix<T, N> valarray_mul(const Matrix<T, N> &a, const Matrix<T, N> &b)
  187. {
  188. Matrix<T, N> c;
  189. using V = std::valarray<T>;
  190. for (size_t i = 0; i < N; ++i) {
  191. V c_i = a[i][0] * V(&b[0][0], N);
  192. for (size_t k = 1; k < N; ++k) {
  193. c_i += a[i][k] * V(&b[k][0], N);
  194. }
  195. std::copy(std::begin(c_i), std::end(c_i), &c[i][0]);
  196. }
  197. return c;
  198. }
  199. template <template <typename, size_t> class M, typename T, size_t N,
  200. typename = Vc::enable_if<(std::is_same<M<T, N>, Matrix<T, N>>::value ||
  201. std::is_same<M<T, N>, MatrixValarray<T, N>>::value)>>
  202. std::ostream &operator<<(std::ostream &out, const M<T, N> &m)
  203. {
  204. out.precision(3);
  205. auto &&w = std::setw(6);
  206. out << "⎡" << w << m[0][0];
  207. for (size_t j = 1; j < N; ++j) {
  208. out << ", " << w << m[0][j];
  209. }
  210. out << " ⎤\n";
  211. for (size_t i = 1; i + 1 < N; ++i) {
  212. out << "⎢" << w << m[i][0];
  213. for (size_t j = 1; j < N; ++j) {
  214. out << ", " << w << m[i][j];
  215. }
  216. out << " ⎥\n";
  217. }
  218. out << "⎣" << w << m[N - 1][0];
  219. for (size_t j = 1; j < N; ++j) {
  220. out << ", " << w << m[N - 1][j];
  221. }
  222. return out << " ⎦\n";
  223. }
  224. #ifdef Vc_MSVC
  225. #pragma optimize("", off)
  226. template <typename T> void unused(T &&x) { x = x; }
  227. #pragma optimize("", on)
  228. #else
  229. template <typename T> void unused(T &&x) { asm("" ::"m"(x)); }
  230. #endif
  231. template <size_t N, typename F> Vc_ALWAYS_INLINE void benchmark(F &&f)
  232. {
  233. TimeStampCounter tsc;
  234. auto cycles = tsc.cycles();
  235. cycles = 0x7fffffff;
  236. for (int i = 0; i < 100; ++i) {
  237. tsc.start();
  238. for (int j = 0; j < 10; ++j) {
  239. auto C = f();
  240. unused(C);
  241. }
  242. tsc.stop();
  243. cycles = std::min(cycles, tsc.cycles());
  244. }
  245. //std::cout << cycles << " Cycles for " << N *N *(N + N - 1) << " FLOP => ";
  246. std::cout << std::setw(19) << std::setprecision(3)
  247. << double(N * N * (N + N - 1) * 10) / cycles;
  248. //<< " FLOP/cycle (" << variant << ")\n";
  249. }
  250. template <size_t N> void run()
  251. {
  252. Matrix<float, N> A;
  253. Matrix<float, N> B;
  254. MatrixValarray<float, N> AV;
  255. MatrixValarray<float, N> BV;
  256. for (size_t i = 0; i < N; ++i) {
  257. for (size_t j = 0; j < N; ++j) {
  258. A[i][j] = 0.01 * (i + j);
  259. B[i][j] = 0.01 * (N + i - j);
  260. AV[i][j] = 0.01 * (i + j);
  261. BV[i][j] = 0.01 * (N + i - j);
  262. }
  263. }
  264. std::cout << std::setw(2) << N;
  265. #if defined Vc_MSVC
  266. auto &&fakeModify = [](Matrix<float, N> &a, Matrix<float, N> &b) {
  267. unused(a);
  268. unused(b);
  269. };
  270. #else
  271. auto &&fakeModify = [](Matrix<float, N> &a, Matrix<float, N> &b) {
  272. #ifdef Vc_ICC
  273. asm("" ::"r"(&a), "r"(&b));
  274. #else
  275. asm("" : "+m"(a), "+m"(b));
  276. #endif
  277. };
  278. #endif
  279. benchmark<N>([&] {
  280. fakeModify(A, B);
  281. return scalar_mul(A, B);
  282. });
  283. benchmark<N>([&] {
  284. fakeModify(A, B);
  285. return scalar_mul_blocked(A, B);
  286. });
  287. benchmark<N>([&] {
  288. fakeModify(A, B);
  289. return A * B;
  290. });
  291. benchmark<N>([&] {
  292. fakeModify(A, B);
  293. return AV * BV;
  294. });
  295. std::cout << std::endl;
  296. }
  297. int Vc_CDECL main()
  298. {
  299. std::cout << " N scalar scalar & blocked Vector<T> valarray\n";
  300. run< 4>();
  301. run< 5>();
  302. run< 6>();
  303. run< 7>();
  304. run< 8>();
  305. run< 9>();
  306. run<10>();
  307. run<11>();
  308. run<12>();
  309. run<13>();
  310. run<14>();
  311. run<15>();
  312. run<16>();
  313. run<17>();
  314. run<18>();
  315. run<19>();
  316. run<20>();
  317. run<21>();
  318. run<22>();
  319. run<23>();
  320. run<24>();
  321. run<25>();
  322. run<26>();
  323. run<27>();
  324. run<28>();
  325. run<29>();
  326. run<30>();
  327. run<31>();
  328. run<32>();
  329. run<33>();
  330. run<34>();
  331. run<35>();
  332. return 0;
  333. }