OptimizeForArchitecture.cmake 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. # Determine the host CPU feature set and determine the best set of compiler
  2. # flags to enable all supported SIMD relevant features. Alternatively, the
  3. # target CPU can be explicitly selected (for generating more generic binaries
  4. # or for targeting a different system).
  5. # Compilers provide e.g. the -march=native flag to achieve a similar result.
  6. # This fails to address the need for building for a different microarchitecture
  7. # than the current host.
  8. # The script tries to deduce all settings from the model and family numbers of
  9. # the CPU instead of reading the CPUID flags from e.g. /proc/cpuinfo. This makes
  10. # the detection more independent from the CPUID code in the kernel (e.g. avx2 is
  11. # not listed on older kernels).
  12. #
  13. # Usage:
  14. # OptimizeForArchitecture()
  # If any of Vc_SSE_INTRINSICS_BROKEN, Vc_AVX_INTRINSICS_BROKEN, or
  # Vc_AVX2_INTRINSICS_BROKEN is defined and set, the OptimizeForArchitecture
  # macro will disable the relevant features via compiler flags.
  18. #=============================================================================
  19. # Copyright 2010-2016 Matthias Kretz <kretz@kde.org>
  20. #
  21. # Redistribution and use in source and binary forms, with or without
  22. # modification, are permitted provided that the following conditions are
  23. # met:
  24. #
  25. # * Redistributions of source code must retain the above copyright notice,
  26. # this list of conditions and the following disclaimer.
  27. # * Redistributions in binary form must reproduce the above copyright notice,
  28. # this list of conditions and the following disclaimer in the documentation
  29. # and/or other materials provided with the distribution.
  30. # * Neither the names of contributing organizations nor the
  31. # names of its contributors may be used to endorse or promote products
  32. # derived from this software without specific prior written permission.
  33. #
  34. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
  35. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  36. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  37. # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
  38. # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  39. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  40. # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  41. # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  42. # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  43. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  44. #=============================================================================
  45. get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH)
  46. include("${_currentDir}/AddCompilerFlag.cmake")
  47. include(CheckIncludeFileCXX)
  # _my_find(<list-variable-name> <value> <result-variable-name>)
  #
  # Membership test: stores TRUE in <result-variable-name> when <value> is an
  # element of the list named by <list-variable-name>, FALSE otherwise.
  # This is a macro, so it runs in the caller's scope; the variable _found is
  # used as scratch storage for the index returned by list(FIND).
  macro(_my_find _list _value _ret)
    list(FIND ${_list} "${_value}" _found)
    # list(FIND) yields -1 for "not present" and a zero-based index otherwise.
    if(_found GREATER -1)
      set(${_ret} TRUE)
    else()
      set(${_ret} FALSE)
    endif()
  endmacro()
  # AutodetectHostArchitecture()
  #
  # Inspects the CPU of the machine running CMake and stores a
  # microarchitecture name in TARGET_ARCHITECTURE. The value stays "generic"
  # when the vendor/family/model combination is not recognised (or the
  # operating system is not one of Linux, Darwin, Windows).
  #
  # Side effects (this is a macro, so everything lands in the caller's scope):
  #   TARGET_ARCHITECTURE          - detected architecture name
  #   Vc_ARCHITECTURE_FLAGS        - reset to empty
  #   _vendor_id, _cpu_family,
  #   _cpu_model, _cpu_flags       - raw values read from the system
  #   _available_vector_units_list - appended to for Intel family 15 (NetBurst)
  macro(AutodetectHostArchitecture)
    set(TARGET_ARCHITECTURE "generic")
    set(Vc_ARCHITECTURE_FLAGS)
    set(_vendor_id)
    set(_cpu_family)
    set(_cpu_model)
    # Reset as well so that a stale value from an earlier run cannot leak into
    # the SSE4.1 check for Penryn on platforms that do not provide CPU flags.
    set(_cpu_flags)

    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
      file(READ "/proc/cpuinfo" _cpuinfo)
      string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}")
      string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}")
      string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}")
      string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}")
    elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
      # sysctl -n prints one value per line, in the order the keys are given.
      execute_process(
        COMMAND /usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.model machdep.cpu.family machdep.cpu.features
        OUTPUT_VARIABLE _sysctl_output_string
        RESULT_VARIABLE _sysctl_result
        ERROR_QUIET
        OUTPUT_STRIP_TRAILING_WHITESPACE)
      set(_sysctl_output)
      if(_sysctl_result EQUAL 0)
        # Quoted: an empty output must not turn into a missing argument.
        string(REPLACE "\n" ";" _sysctl_output "${_sysctl_output_string}")
      endif()
      list(LENGTH _sysctl_output _sysctl_count)
      if(_sysctl_count GREATER 2)
        list(GET _sysctl_output 0 _vendor_id)
        list(GET _sysctl_output 1 _cpu_model)
        list(GET _sysctl_output 2 _cpu_family)
      else()
        message(WARNING "Could not query the CPU via /usr/sbin/sysctl. Auto-detection of optimization flags failed and will use the generic CPU settings.")
      endif()
      if(_sysctl_count GREATER 3)
        list(GET _sysctl_output 3 _cpu_flags)
        # Normalise to the /proc/cpuinfo spelling, e.g. "SSE4.1" -> "sse4_1".
        string(TOLOWER "${_cpu_flags}" _cpu_flags)
        string(REPLACE "." "_" _cpu_flags "${_cpu_flags}")
      endif()
    elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
      # The bracketed registry references are resolved by get_filename_component.
      get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE)
      get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE)
      mark_as_advanced(_vendor_id _cpu_id)
      string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}")
      string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}")
    endif()

    if(_vendor_id STREQUAL "GenuineIntel")
      if(_cpu_family EQUAL 6)
        # taken from the Intel ORM
        # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html
        # CPUID Signature Values of Recent Intel Microarchitectures
        # 4E 5E       | Skylake microarchitecture
        # 3D 47 56    | Broadwell microarchitecture
        # 3C 45 46 3F | Haswell microarchitecture
        # 3A 3E       | Ivy Bridge microarchitecture
        # 2A 2D       | Sandy Bridge microarchitecture
        # 25 2C 2F    | Intel microarchitecture Westmere
        # 1A 1E 1F 2E | Intel microarchitecture Nehalem
        # 17 1D       | Enhanced Intel Core microarchitecture
        # 0F          | Intel Core microarchitecture
        #
        # Intel SDM Vol. 3C 35-1 / December 2016:
        # 57       | Xeon Phi 3200, 5200, 7200 [Knights Landing]
        # 85       | Future Xeon Phi
        # 8E 9E    | 7th gen. Core [Kaby Lake]
        # 55       | Future Xeon [Skylake w/ AVX512]
        # 4E 5E    | 6th gen. Core / E3 v5 [Skylake w/o AVX512]
        # 56       | Xeon D-1500 [Broadwell]
        # 4F       | Xeon E5 v4, E7 v4, i7-69xx [Broadwell]
        # 47       | 5th gen. Core / Xeon E3 v4 [Broadwell]
        # 3D       | M-5xxx / 5th gen. [Broadwell]
        # 3F       | Xeon E5 v3, E7 v3, i7-59xx [Haswell-E]
        # 3C 45 46 | 4th gen. Core, Xeon E3 v3 [Haswell]
        # 3E       | Xeon E5 v2, E7 v2, i7-49xx [Ivy Bridge-E]
        # 3A       | 3rd gen. Core, Xeon E3 v2 [Ivy Bridge]
        # 2D       | Xeon E5, i7-39xx [Sandy Bridge]
        # 2F       | Xeon E7
        # 2A       | Xeon E3, 2nd gen. Core [Sandy Bridge]
        # 2E       | Xeon 7500, 6500 series
        # 25 2C    | Xeon 3600, 5600 series, Core i7, i5 and i3
        #
        # Values from the Intel SDE:
        # 5C | Goldmont
        # 5A | Silvermont
        # 57 | Knights Landing
        # 66 | Cannonlake
        # 55 | Skylake Server
        # 4E | Skylake Client
        # 3C | Broadwell (likely a bug in the SDE)
        # 3C | Haswell
        #
        # The tables above are hexadecimal; the comparisons below use the
        # decimal model number as reported by the operating system.
        if(_cpu_model EQUAL 87) # 57
          set(TARGET_ARCHITECTURE "knl") # Knights Landing
        elseif(_cpu_model EQUAL 92) # 5C
          set(TARGET_ARCHITECTURE "goldmont")
        elseif(_cpu_model EQUAL 90 OR _cpu_model EQUAL 76) # 5A, 4C
          set(TARGET_ARCHITECTURE "silvermont")
        elseif(_cpu_model EQUAL 102) # 66
          set(TARGET_ARCHITECTURE "cannonlake")
        elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158) # 8E, 9E
          set(TARGET_ARCHITECTURE "kaby-lake")
        elseif(_cpu_model EQUAL 85) # 55
          set(TARGET_ARCHITECTURE "skylake-avx512")
        elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) # 4E, 5E
          set(TARGET_ARCHITECTURE "skylake")
        elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) # 3D, 47, 4F, 56
          set(TARGET_ARCHITECTURE "broadwell")
        elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) # 3C, 45, 46, 3F
          set(TARGET_ARCHITECTURE "haswell")
        elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) # 3A, 3E
          set(TARGET_ARCHITECTURE "ivy-bridge")
        elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) # 2A, 2D
          set(TARGET_ARCHITECTURE "sandy-bridge")
        elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) # 25, 2C, 2F
          set(TARGET_ARCHITECTURE "westmere")
        elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) # 1A, 1E, 1F, 2E
          set(TARGET_ARCHITECTURE "nehalem")
        elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) # 17, 1D
          set(TARGET_ARCHITECTURE "penryn")
        elseif(_cpu_model EQUAL 15) # 0F
          set(TARGET_ARCHITECTURE "merom")
        elseif(_cpu_model EQUAL 28)
          set(TARGET_ARCHITECTURE "atom")
        elseif(_cpu_model EQUAL 14)
          set(TARGET_ARCHITECTURE "core")
        elseif(_cpu_model LESS 14)
          message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.")
          set(TARGET_ARCHITECTURE "generic")
        else()
          # Unknown model above 14: fall back to the Core 2 feature set.
          message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.")
          set(TARGET_ARCHITECTURE "merom")
        endif()
      elseif(_cpu_family EQUAL 7) # Itanium (not supported)
        message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.")
      elseif(_cpu_family EQUAL 15) # NetBurst
        # TARGET_ARCHITECTURE stays "generic"; only the vector units are recorded.
        list(APPEND _available_vector_units_list "sse" "sse2")
        if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead
          list(APPEND _available_vector_units_list "sse3")
        endif()
      endif()
    elseif(_vendor_id STREQUAL "AuthenticAMD")
      if(_cpu_family EQUAL 23) # 17h
        set(TARGET_ARCHITECTURE "zen")
      elseif(_cpu_family EQUAL 22) # 16h
        set(TARGET_ARCHITECTURE "AMD 16h")
      elseif(_cpu_family EQUAL 21) # 15h
        if(_cpu_model LESS 2)
          set(TARGET_ARCHITECTURE "bulldozer")
        else()
          set(TARGET_ARCHITECTURE "piledriver")
        endif()
      elseif(_cpu_family EQUAL 20) # 14h
        set(TARGET_ARCHITECTURE "AMD 14h")
      elseif(_cpu_family EQUAL 18) # 12h
        # No dedicated settings for family 12h; TARGET_ARCHITECTURE stays "generic".
      elseif(_cpu_family EQUAL 16) # 10h
        set(TARGET_ARCHITECTURE "barcelona")
      elseif(_cpu_family EQUAL 15)
        set(TARGET_ARCHITECTURE "k8")
        if(_cpu_model GREATER 64) # I don't know the right number to put here. This is just a guess from the hardware I have access to
          set(TARGET_ARCHITECTURE "k8-sse3")
        endif()
      endif()
    endif()
  endmacro()
  # OptimizeForArchitecture()
  #
  # Public entry point. On x86/x86-64 targets it delegates to
  # OptimizeForArchitectureX86(); on every other processor it only defines the
  # TARGET_ARCHITECTURE cache entry (value "unused") and reports that no
  # auto-detection is available.
  macro(OptimizeForArchitecture)
    # CMAKE_SYSTEM_PROCESSOR is spelled differently depending on the platform:
    # "x86"/"AMD64" (Windows), "x86_64" (Linux, macOS), "amd64" (BSD),
    # "i386".."i686" (32-bit Unix). All of them denote an x86 target.
    if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86|AMD64|amd64|i[3-6]86)")
      OptimizeForArchitectureX86()
    else()
      message(STATUS "No support for auto-detection of the target instruction set/extension")
      set(TARGET_ARCHITECTURE "unused" CACHE STRING "CPU architecture to optimize for. (unused)")
    endif()
  endmacro()
  # OptimizeForArchitectureX86()
  #
  # Turns the TARGET_ARCHITECTURE cache setting into compiler flags.
  #
  # Steps:
  #   1. Declare the TARGET_ARCHITECTURE cache entry (default "auto") and, for
  #      "auto", call AutodetectHostArchitecture().
  #   2. Map the architecture name to a preference-ordered list of -march
  #      candidates (_march_flag_list) and to the list of instruction set
  #      extensions it offers (_available_vector_units_list).
  #   3. Create one advanced USE_<EXT> cache option per extension, taking the
  #      Vc_*_INTRINSICS_BROKEN switches into account.
  #   4. Append the matching compiler options to Vc_ARCHITECTURE_FLAGS, with a
  #      separate strategy for MSVC, the Intel compiler, and everything else.
  #
  # This is a macro: all variables it sets are visible in the caller's scope.
  # It relies on AddCompilerFlag() and UserWarning() being available.
  macro(OptimizeForArchitectureX86)
    # The continuation lines below start in column 0 on purpose: leading
    # whitespace would become part of the documentation string.
    set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. \
Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. \
Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. \
Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \
\"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \
\"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kaby-lake\", \"cannonlake\", \"silvermont\", \
\"goldmont\", \"knl\" (Knights Landing), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \
\"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \
\"AMD 14h\", \"AMD 16h\", \"zen\".")

    # When the requested architecture differs from the one recorded by the
    # previous configure run, the USE_* cache entries are overwritten (FORCE)
    # instead of keeping their cached values.
    set(_force)
    if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}")
      message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"")
      set(_force FORCE)
    endif()
    set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE)
    mark_as_advanced(_last_target_arch)

    # Architecture names are matched case-insensitively.
    string(TOLOWER "${TARGET_ARCHITECTURE}" TARGET_ARCHITECTURE)
    set(_march_flag_list)
    set(_available_vector_units_list)

    if(TARGET_ARCHITECTURE STREQUAL "auto")
      AutodetectHostArchitecture()
      message(STATUS "Detected CPU: ${TARGET_ARCHITECTURE}")
    endif()

    # Helper macros, one per Intel microarchitecture. Each one appends its own
    # -march candidates first and then chains to its predecessor, so that
    # _march_flag_list is ordered from most to least specific.
    macro(_nehalem)
      list(APPEND _march_flag_list "nehalem")
      list(APPEND _march_flag_list "corei7")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2")
    endmacro()
    macro(_westmere)
      list(APPEND _march_flag_list "westmere")
      _nehalem()
    endmacro()
    macro(_sandybridge)
      list(APPEND _march_flag_list "sandybridge")
      list(APPEND _march_flag_list "corei7-avx")
      _westmere()
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx")
    endmacro()
    macro(_ivybridge)
      list(APPEND _march_flag_list "ivybridge")
      list(APPEND _march_flag_list "core-avx-i")
      _sandybridge()
      list(APPEND _available_vector_units_list "rdrnd" "f16c")
    endmacro()
    macro(_haswell)
      list(APPEND _march_flag_list "haswell")
      list(APPEND _march_flag_list "core-avx2")
      _ivybridge()
      list(APPEND _available_vector_units_list "avx2" "fma" "bmi" "bmi2")
    endmacro()
    macro(_broadwell)
      list(APPEND _march_flag_list "broadwell")
      _haswell()
    endmacro()
    macro(_skylake)
      list(APPEND _march_flag_list "skylake")
      _broadwell()
    endmacro()
    macro(_skylake_avx512)
      list(APPEND _march_flag_list "skylake-avx512")
      _skylake()
      list(APPEND _available_vector_units_list "avx512f" "avx512cd" "avx512dq" "avx512bw" "avx512vl")
    endmacro()
    macro(_cannonlake)
      list(APPEND _march_flag_list "cannonlake")
      _skylake_avx512()
      list(APPEND _available_vector_units_list "avx512ifma" "avx512vbmi")
    endmacro()
    macro(_knightslanding)
      list(APPEND _march_flag_list "knl")
      _broadwell()
      list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd")
    endmacro()
    macro(_silvermont)
      list(APPEND _march_flag_list "silvermont")
      _westmere()
      list(APPEND _available_vector_units_list "rdrnd")
    endmacro()
    macro(_goldmont)
      list(APPEND _march_flag_list "goldmont")
      _silvermont()
    endmacro()

    # Map the architecture name to -march candidates and available extensions.
    if(TARGET_ARCHITECTURE STREQUAL "core")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3")
    elseif(TARGET_ARCHITECTURE STREQUAL "merom")
      list(APPEND _march_flag_list "merom")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3")
    elseif(TARGET_ARCHITECTURE STREQUAL "penryn")
      list(APPEND _march_flag_list "penryn")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3")
      message(STATUS "Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.")
      # _cpu_flags is filled in by AutodetectHostArchitecture() where available.
      if(_cpu_flags MATCHES "sse4_1")
        message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)")
        list(APPEND _available_vector_units_list "sse4.1")
      else()
        message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)")
      endif()
    elseif(TARGET_ARCHITECTURE STREQUAL "knl")
      _knightslanding()
    elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake")
      _cannonlake()
    elseif(TARGET_ARCHITECTURE STREQUAL "kaby-lake")
      _skylake()
    elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512")
      _skylake_avx512()
    elseif(TARGET_ARCHITECTURE STREQUAL "skylake")
      _skylake()
    elseif(TARGET_ARCHITECTURE STREQUAL "broadwell")
      _broadwell()
    elseif(TARGET_ARCHITECTURE STREQUAL "haswell")
      _haswell()
    elseif(TARGET_ARCHITECTURE STREQUAL "ivy-bridge")
      _ivybridge()
    elseif(TARGET_ARCHITECTURE STREQUAL "sandy-bridge")
      _sandybridge()
    elseif(TARGET_ARCHITECTURE STREQUAL "westmere")
      _westmere()
    elseif(TARGET_ARCHITECTURE STREQUAL "nehalem")
      _nehalem()
    elseif(TARGET_ARCHITECTURE STREQUAL "goldmont")
      _goldmont()
    elseif(TARGET_ARCHITECTURE STREQUAL "silvermont")
      _silvermont()
    elseif(TARGET_ARCHITECTURE STREQUAL "atom")
      list(APPEND _march_flag_list "atom")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3")
    elseif(TARGET_ARCHITECTURE STREQUAL "k8")
      list(APPEND _march_flag_list "k8")
      list(APPEND _available_vector_units_list "sse" "sse2")
    elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3")
      list(APPEND _march_flag_list "k8-sse3")
      list(APPEND _march_flag_list "k8")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3")
    # The "AMD ..." names arrive in mixed case from auto-detection but in lower
    # case when the user sets them explicitly (see TOLOWER above); accept both.
    elseif(TARGET_ARCHITECTURE STREQUAL "AMD 16h" OR TARGET_ARCHITECTURE STREQUAL "amd 16h")
      list(APPEND _march_flag_list "btver2")
      list(APPEND _march_flag_list "btver1")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c")
    elseif(TARGET_ARCHITECTURE STREQUAL "AMD 14h" OR TARGET_ARCHITECTURE STREQUAL "amd 14h")
      list(APPEND _march_flag_list "btver1")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a")
    elseif(TARGET_ARCHITECTURE STREQUAL "zen")
      list(APPEND _march_flag_list "znver1")
      _skylake()
      list(APPEND _available_vector_units_list "sse4a")
    elseif(TARGET_ARCHITECTURE STREQUAL "piledriver")
      list(APPEND _march_flag_list "bdver2")
      list(APPEND _march_flag_list "bdver1")
      list(APPEND _march_flag_list "bulldozer")
      list(APPEND _march_flag_list "barcelona")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c")
    elseif(TARGET_ARCHITECTURE STREQUAL "interlagos")
      list(APPEND _march_flag_list "bdver1")
      list(APPEND _march_flag_list "bulldozer")
      list(APPEND _march_flag_list "barcelona")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4")
    elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer")
      list(APPEND _march_flag_list "bdver1")
      list(APPEND _march_flag_list "bulldozer")
      list(APPEND _march_flag_list "barcelona")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4")
    elseif(TARGET_ARCHITECTURE STREQUAL "barcelona")
      list(APPEND _march_flag_list "barcelona")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
    elseif(TARGET_ARCHITECTURE STREQUAL "istanbul")
      list(APPEND _march_flag_list "barcelona")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
    elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours")
      list(APPEND _march_flag_list "barcelona")
      list(APPEND _march_flag_list "core2")
      list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
    elseif(TARGET_ARCHITECTURE STREQUAL "generic")
      list(APPEND _march_flag_list "generic")
    elseif(TARGET_ARCHITECTURE STREQUAL "none")
      # add this clause to remove it from the else clause
    else()
      message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.")
    endif()

    if(NOT TARGET_ARCHITECTURE STREQUAL "none")
      set(_disable_vector_unit_list)
      set(_enable_vector_unit_list)

      # Translate the Vc_*_INTRINSICS_BROKEN switches into _*_broken booleans.
      # A broken AVX toolchain implies that AVX2, FMA4 and XOP are unusable too.
      if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN)
        UserWarning("AVX disabled per default because of old/broken toolchain")
        set(_avx_broken true)
        set(_avx2_broken true)
        set(_fma4_broken true)
        set(_xop_broken true)
      else()
        set(_avx_broken false)
        if(DEFINED Vc_FMA4_INTRINSICS_BROKEN AND Vc_FMA4_INTRINSICS_BROKEN)
          UserWarning("FMA4 disabled per default because of old/broken toolchain")
          set(_fma4_broken true)
        else()
          set(_fma4_broken false)
        endif()
        if(DEFINED Vc_XOP_INTRINSICS_BROKEN AND Vc_XOP_INTRINSICS_BROKEN)
          UserWarning("XOP disabled per default because of old/broken toolchain")
          set(_xop_broken true)
        else()
          set(_xop_broken false)
        endif()
        if(DEFINED Vc_AVX2_INTRINSICS_BROKEN AND Vc_AVX2_INTRINSICS_BROKEN)
          UserWarning("AVX2 disabled per default because of old/broken toolchain")
          set(_avx2_broken true)
        else()
          set(_avx2_broken false)
        endif()
      endif()

      # _enable_or_disable(<NAME> <flag> <documentation> <broken>)
      # Creates the advanced cache option USE_<NAME>. Its default is TRUE when
      # <flag> is in _available_vector_units_list and <broken> is false.
      # <broken> is either a boolean constant or the name of a variable holding
      # one. Depending on the resulting option value, <flag> is appended to
      # _enable_vector_unit_list or _disable_vector_unit_list.
      macro(_enable_or_disable _name _flag _documentation _broken)
        # Macro parameters are text substitutions, not variables, so the
        # argument has to be expanded explicitly; a bare if(_broken) would test
        # an unrelated variable literally named "_broken".
        if(${_broken})
          set(_found false)
        else()
          _my_find(_available_vector_units_list "${_flag}" _found)
        endif()
        set(USE_${_name} ${_found} CACHE BOOL "${_documentation}" ${_force})
        mark_as_advanced(USE_${_name})
        if(USE_${_name})
          list(APPEND _enable_vector_unit_list "${_flag}")
        else()
          list(APPEND _disable_vector_unit_list "${_flag}")
        endif()
      endmacro()

      _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." false)
      _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." false)
      _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." false)
      _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." false)
      _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." false)
      _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." false)
      _enable_or_disable(AVX "avx" "Use AVX. This will double all floating-point vector sizes relative to SSE." _avx_broken)
      _enable_or_disable(FMA "fma" "Use FMA." _avx_broken)
      _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken)
      _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken)
      _enable_or_disable(XOP "xop" "Use XOP." _xop_broken)
      _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken)
      _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." false)
      _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken)
      _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." false)
      _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." false)
      _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." false)
      _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." false)
      _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." false)
      _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." false)
      _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." false)

      if(MSVC)
        # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX)
        # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010)
        #
        # Try the strongest /arch: option first and fall back step by step.
        # _found ends up TRUE as soon as one option was accepted.
        _my_find(_enable_vector_unit_list "avx2" _found)
        if(_found)
          AddCompilerFlag("/arch:AVX2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _found)
        endif()
        if(NOT _found)
          _my_find(_enable_vector_unit_list "avx" _found)
          if(_found)
            AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _found)
          endif()
        endif()
        if(NOT _found)
          _my_find(_enable_vector_unit_list "sse2" _found)
          if(_found)
            AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
          endif()
        endif()
        # Define a GCC-style feature macro for every enabled extension,
        # e.g. "sse4.1" becomes -D__SSE4_1__.
        foreach(_flag ${_enable_vector_unit_list})
          string(TOUPPER "${_flag}" _flag)
          string(REPLACE "." "_" _flag "__${_flag}__")
          add_definitions("-D${_flag}")
        endforeach()
      elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux)
        # Map -march candidate names to the corresponding ICC -x option.
        set(OFA_map_knl "-xMIC-AVX512")
        set(OFA_map_cannonlake "-xCORE-AVX512")
        set(OFA_map_skylake-avx512 "-xCORE-AVX512")
        set(OFA_map_skylake "-xCORE-AVX2")
        set(OFA_map_broadwell "-xCORE-AVX2")
        set(OFA_map_haswell "-xCORE-AVX2")
        set(OFA_map_ivybridge "-xCORE-AVX-I")
        set(OFA_map_sandybridge "-xAVX")
        set(OFA_map_westmere "-xSSE4.2")
        set(OFA_map_nehalem "-xSSE4.2")
        set(OFA_map_penryn "-xSSSE3")
        set(OFA_map_merom "-xSSSE3")
        set(OFA_map_core2 "-xSSE3")
        # Use the first candidate that has a mapping and is accepted.
        set(_ok FALSE)
        foreach(arch ${_march_flag_list})
          if(DEFINED OFA_map_${arch})
            AddCompilerFlag(${OFA_map_${arch}} CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _ok)
            if(_ok)
              break()
            endif()
          endif()
        endforeach()
        if(NOT _ok)
          # This is the Intel compiler, so SSE2 is a very reasonable baseline.
          message(STATUS "Did not recognize the requested architecture flag, falling back to SSE2")
          AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
        endif()
      else() # not MSVC and not ICC => GCC, Clang, Open64
        # Use the first -march candidate the compiler accepts.
        foreach(_flag ${_march_flag_list})
          AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
          if(_good)
            break()
          endif()
        endforeach()
        # Enable each requested extension with -m<ext>, provided the compiler
        # accepts the option and (where one is known) the matching intrinsics
        # header can be included.
        foreach(_flag ${_enable_vector_unit_list})
          AddCompilerFlag("-m${_flag}" CXX_RESULT _result)
          if(_result)
            set(_header FALSE)
            if(_flag STREQUAL "sse3")
              set(_header "pmmintrin.h")
            elseif(_flag STREQUAL "ssse3")
              set(_header "tmmintrin.h")
            elseif(_flag STREQUAL "sse4.1")
              set(_header "smmintrin.h")
            elseif(_flag STREQUAL "sse4.2")
              set(_header "smmintrin.h")
            elseif(_flag STREQUAL "sse4a")
              set(_header "ammintrin.h")
            elseif(_flag STREQUAL "avx")
              set(_header "immintrin.h")
            elseif(_flag STREQUAL "avx2")
              set(_header "immintrin.h")
            elseif(_flag STREQUAL "fma4")
              set(_header "x86intrin.h")
            elseif(_flag STREQUAL "xop")
              set(_header "x86intrin.h")
            endif()
            # Result variable of the header check, e.g. HAVE_immintrin_h.
            set(_resultVar "HAVE_${_header}")
            string(REPLACE "." "_" _resultVar "${_resultVar}")
            if(_header)
              CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}")
              if(NOT ${_resultVar})
                # Header missing: switch the USE_<EXT> option off for this run
                # and request the explicit -mno-<ext> below.
                set(_useVar "USE_${_flag}")
                string(TOUPPER "${_useVar}" _useVar)
                string(REPLACE "." "_" _useVar "${_useVar}")
                message(STATUS "disabling ${_useVar} because ${_header} is missing")
                set(${_useVar} FALSE)
                list(APPEND _disable_vector_unit_list "${_flag}")
              endif()
            endif()
            if(NOT _header OR ${_resultVar})
              list(APPEND Vc_ARCHITECTURE_FLAGS "-m${_flag}")
            endif()
          endif()
        endforeach()
        # Explicitly switch off everything that is not wanted.
        foreach(_flag ${_disable_vector_unit_list})
          AddCompilerFlag("-mno-${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
        endforeach()
      endif()
    endif()
  endmacro()