| /**************************************************************************** |
| ** |
| ** Copyright (C) 2015 The Qt Company Ltd. |
| ** Contact: http://www.qt.io/licensing/ |
| ** |
| ** This file is part of the QtCore module of the Qt Toolkit. |
| ** |
| ** $QT_BEGIN_LICENSE:LGPL21$ |
| ** Commercial License Usage |
| ** Licensees holding valid commercial Qt licenses may use this file in |
| ** accordance with the commercial license agreement provided with the |
| ** Software or, alternatively, in accordance with the terms contained in |
| ** a written agreement between you and The Qt Company. For licensing terms |
| ** and conditions see http://www.qt.io/terms-conditions. For further |
| ** information use the contact form at http://www.qt.io/contact-us. |
| ** |
| ** GNU Lesser General Public License Usage |
| ** Alternatively, this file may be used under the terms of the GNU Lesser |
| ** General Public License version 2.1 or version 3 as published by the Free |
| ** Software Foundation and appearing in the file LICENSE.LGPLv21 and |
| ** LICENSE.LGPLv3 included in the packaging of this file. Please review the |
| ** following information to ensure the GNU Lesser General Public License |
| ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and |
| ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
| ** |
| ** As a special exception, The Qt Company gives you certain additional |
| ** rights. These rights are described in The Qt Company LGPL Exception |
| ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
| ** |
| ** $QT_END_LICENSE$ |
| ** |
| ****************************************************************************/ |
| |
| #ifndef QSIMD_P_H |
| #define QSIMD_P_H |
| |
| // |
| // W A R N I N G |
| // ------------- |
| // |
| // This file is not part of the Qt API. It exists purely as an |
| // implementation detail. This header file may change from version to |
| // version without notice, or even be removed. |
| // |
| // We mean it. |
| // |
| |
| #include <qglobal.h> |
| #include <qatomic.h> |
| |
| /* |
| * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros. |
| * They mean the compiler supports the necessary flags and the headers |
| * for the x86 and ARM intrinsics: |
| * - GCC: the -mXXX or march=YYY flag is necessary before #include |
| * up to 4.8; GCC >= 4.9 can include unconditionally |
| * - Intel CC: #include can happen unconditionally |
| * - MSVC: #include can happen unconditionally |
| * - RVCT: ??? |
| * |
| * We will try to include all headers possible under this configuration. |
| * |
| * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 & |
| * up do define __AVX__ if the -arch:AVX option is passed on the command-line. |
| * |
| * Supported XXX are: |
| * Flag | Arch | GCC | Intel CC | MSVC | |
| * ARM_NEON | ARM | I & C | None | ? | |
| * SSE2 | x86 | I & C | I & C | I & C | |
| * SSE3 | x86 | I & C | I & C | I only | |
| * SSSE3 | x86 | I & C | I & C | I only | |
| * SSE4_1 | x86 | I & C | I & C | I only | |
| * SSE4_2 | x86 | I & C | I & C | I only | |
| * AVX | x86 | I & C | I & C | I & C | |
| * AVX2 | x86 | I & C | I & C | I only | |
| * I = intrinsics; C = code generation |
| * |
| * Code can use the following constructs to determine compiler support & status: |
| * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__) |
| * If this test passes, then the compiler is already generating code for that |
| * given sub-architecture. The intrinsics for that sub-architecture are |
| * #included and can be used without restriction or runtime check. |
| * |
| * - #if QT_COMPILER_SUPPORTS(XXX) |
| * If this test passes, then the compiler is able to generate code for that |
| * given sub-architecture in another translation unit, given the right set of |
| * flags. Use of the intrinsics is not guaranteed. This is useful with |
| * runtime detection (see below). |
| * |
| * - #if QT_COMPILER_SUPPORTS_HERE(XXX) |
| * If this test passes, then the compiler is able to generate code for that |
| * given sub-architecture in this translation unit, even if it is not doing |
| * that now (it might be). Individual functions may be tagged with |
| * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that |
| * sub-arch. Only inside such functions is the use of the intrisics |
| * guaranteed to work. This is useful with runtime detection (see below). |
| * |
| * Runtime detection of a CPU sub-architecture can be done with the |
| * qCpuHasFeature(XXX) function. There are two strategies for generating |
| * optimized code like that: |
| * |
| * 1) place the optimized code in a different translation unit (C or assembly |
| * sources) and pass the correct flags to the compiler to enable support. Those |
| * sources must not include qglobal.h, which means they cannot include this |
| * file either. The dispatcher function would look like this: |
| * |
| * void foo() |
| * { |
| * #if QT_COMPILER_SUPPORTS(XXX) |
| * if (qCpuHasFeature(XXX)) { |
| * foo_optimized_xxx(); |
| * return; |
| * } |
| * #endif |
| * foo_plain(); |
| * } |
| * |
| * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and |
| * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use |
| * other Qt code. The dispatcher function would look like this: |
| * |
| * void foo() |
| * { |
| * #if QT_COMPILER_SUPPORTS_HERE(XXX) |
| * if (qCpuHasFeature(XXX)) { |
| * foo_optimized_xxx(); |
| * return; |
| * } |
| * #endif |
| * foo_plain(); |
| * } |
| */ |
| |
| #if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE)) |
| #include <intrin.h> |
| #endif |
| |
| #define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) |
| |
| #if (defined(Q_CC_INTEL) || defined(Q_CC_MSVC) \ |
| || (defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && (__GNUC__-0) * 100 + (__GNUC_MINOR__-0) >= 409)) \ |
| && !defined(QT_BOOTSTRAPPED) |
| # define QT_COMPILER_SUPPORTS_SIMD_ALWAYS |
| # define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x) |
| # if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) |
| /* GCC requires attributes for a function */ |
| # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
| # else |
| # define QT_FUNCTION_TARGET(x) |
| # endif |
| #else |
| # define QT_COMPILER_SUPPORTS_HERE(x) defined(__ ## x ## __) |
| # define QT_FUNCTION_TARGET(x) |
| #endif |
| |
| // SSE intrinsics |
| #define QT_FUNCTION_TARGET_STRING_SSE2 "sse2" |
| #if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) |
| #if defined(QT_LINUXBASE) || defined(Q_OS_ANDROID_NO_SDK) |
| /// this is an evil hack - the posix_memalign declaration in LSB |
| /// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431 |
| # define posix_memalign _lsb_hack_posix_memalign |
| # include <emmintrin.h> |
| # undef posix_memalign |
| #else |
| # include <emmintrin.h> |
| #endif |
| #if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2) |
| # define __SSE__ 1 |
| # define __SSE2__ 1 |
| #endif |
| #endif |
| |
| // SSE3 intrinsics |
| #define QT_FUNCTION_TARGET_STRING_SSE3 "sse3" |
| #if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) |
| #include <pmmintrin.h> |
| #endif |
| |
| // SSSE3 intrinsics |
| #define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3" |
| #if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) |
| #include <tmmintrin.h> |
| #endif |
| |
| // SSE4.1 intrinsics |
| #define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1" |
| #if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) |
| #include <smmintrin.h> |
| #endif |
| |
| // SSE4.2 intrinsics |
| #define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2" |
| #if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) |
| #include <nmmintrin.h> |
| #endif |
| |
| // AVX intrinsics |
| #define QT_FUNCTION_TARGET_STRING_AVX "avx" |
| #define QT_FUNCTION_TARGET_STRING_AVX2 "avx2" |
| #if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) |
| // immintrin.h is the ultimate header, we don't need anything else after this |
| #include <immintrin.h> |
| |
| # if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__)) |
| // MS Visual Studio 2010 has no macro pre-defined to identify the use of /arch:AVX |
| // MS Visual Studio 2013 adds it: __AVX__ |
| // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 |
| # define __SSE3__ 1 |
| # define __SSSE3__ 1 |
| // no Intel CPU supports SSE4a, so don't define it |
| # define __SSE4_1__ 1 |
| # define __SSE4_2__ 1 |
| # ifndef __AVX__ |
| # define __AVX__ 1 |
| # endif |
| # endif |
| #endif |
| |
| // other x86 intrinsics |
| #if defined(Q_PROCESSOR_X86) && ((defined(Q_CC_GNU) && (Q_CC_GNU >= 404)) \ |
| || (defined(Q_CC_CLANG) && (Q_CC_CLANG >= 208)) \ |
| || defined(Q_CC_INTEL)) |
| # define QT_COMPILER_SUPPORTS_X86INTRIN |
| # ifdef Q_CC_INTEL |
| // The Intel compiler has no <x86intrin.h> -- all intrinsics are in <immintrin.h>; |
| # include <immintrin.h> |
| # else |
| // GCC 4.4 and Clang 2.8 added a few more intrinsics there |
| # include <x86intrin.h> |
| # endif |
| #endif |
| |
| // NEON intrinsics |
| // note: as of GCC 4.9, does not support function targets for ARM |
| #if defined __ARM_NEON__ |
| #include <arm_neon.h> |
| #define QT_FUNCTION_TARGET_STRING_ARM_NEON "neon" |
| #endif |
| |
| #undef QT_COMPILER_SUPPORTS_SIMD_ALWAYS |
| |
| QT_BEGIN_NAMESPACE |
| |
| |
| enum CPUFeatures { |
| NEON = 0x2, ARM_NEON = NEON, |
| SSE2 = 0x4, |
| SSE3 = 0x8, |
| SSSE3 = 0x10, |
| SSE4_1 = 0x20, |
| SSE4_2 = 0x40, |
| AVX = 0x80, |
| AVX2 = 0x100, |
| HLE = 0x200, |
| RTM = 0x400, |
| DSP = 0x800, |
| DSPR2 = 0x1000, |
| |
| // used only to indicate that the CPU detection was initialised |
| QSimdInitialized = 0x80000000 |
| }; |
| |
| static const uint qCompilerCpuFeatures = 0 |
| #if defined __RTM__ |
| | RTM |
| #endif |
| #if defined __HLE__ |
| | HLE |
| #endif |
| #if defined __AVX2__ |
| | AVX2 |
| #endif |
| #if defined __AVX__ |
| | AVX |
| #endif |
| #if defined __SSE4_2__ |
| | SSE4_2 |
| #endif |
| #if defined __SSE4_1__ |
| | SSE4_1 |
| #endif |
| #if defined __SSSE3__ |
| | SSSE3 |
| #endif |
| #if defined __SSE3__ |
| | SSE3 |
| #endif |
| #if defined __SSE2__ |
| | SSE2 |
| #endif |
| #if defined __ARM_NEON__ |
| | NEON |
| #endif |
| #if defined __mips_dsp |
| | DSP |
| #endif |
| #if defined __mips_dspr2 |
| | DSPR2 |
| #endif |
| ; |
| |
| extern Q_CORE_EXPORT QBasicAtomicInt qt_cpu_features; |
| Q_CORE_EXPORT void qDetectCpuFeatures(); |
| |
| static inline uint qCpuFeatures() |
| { |
| int features = qt_cpu_features.load(); |
| if (Q_UNLIKELY(features == 0)) { |
| qDetectCpuFeatures(); |
| features = qt_cpu_features.load(); |
| Q_ASSUME(features != 0); |
| } |
| return uint(features); |
| } |
| |
| #define qCpuHasFeature(feature) ((qCompilerCpuFeatures & (feature)) || (qCpuFeatures() & (feature))) |
| |
| #ifdef Q_PROCESSOR_X86 |
| // Bit scan functions for x86 |
| # if defined(Q_CC_MSVC) && !defined(Q_OS_WINCE) |
| // MSVC calls it _BitScanReverse and returns the carry flag, which we don't need |
| static __forceinline unsigned long _bit_scan_reverse(uint val) |
| { |
| unsigned long result; |
| _BitScanReverse(&result, val); |
| return result; |
| } |
| static __forceinline unsigned long _bit_scan_forward(uint val) |
| { |
| unsigned long result; |
| _BitScanForward(&result, val); |
| return result; |
| } |
| # elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && Q_CC_GNU < 405)) \ |
| && !defined(Q_CC_INTEL) |
| // Clang is missing the intrinsic for _bit_scan_reverse |
| // GCC only added it in version 4.5 |
| static inline __attribute__((always_inline)) |
| unsigned _bit_scan_reverse(unsigned val) |
| { |
| unsigned result; |
| asm("bsr %1, %0" : "=r" (result) : "r" (val)); |
| return result; |
| } |
| static inline __attribute__((always_inline)) |
| unsigned _bit_scan_forward(unsigned val) |
| { |
| unsigned result; |
| asm("bsf %1, %0" : "=r" (result) : "r" (val)); |
| return result; |
| } |
| # endif |
| #endif // Q_PROCESSOR_X86 |
| |
| #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ |
| for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) |
| |
| QT_END_NAMESPACE |
| |
| #endif // QSIMD_P_H |