[clang] 6c174ab - [X86] Remove __builtin_ia32_padd/psub saturated intrinsics and use generic __builtin_elementwise_add/sub_sat
Simon Pilgrim via cfe-commits
cfe-commits at lists.llvm.org
Tue Feb 8 06:22:56 PST 2022
Author: Simon Pilgrim
Date: 2022-02-08T14:21:20Z
New Revision: 6c174ab2ad0676b295f11f6c3913eff9289fa6b9
URL: https://github.com/llvm/llvm-project/commit/6c174ab2ad0676b295f11f6c3913eff9289fa6b9
DIFF: https://github.com/llvm/llvm-project/commit/6c174ab2ad0676b295f11f6c3913eff9289fa6b9.diff
LOG: [X86] Remove __builtin_ia32_padd/psub saturated intrinsics and use generic __builtin_elementwise_add/sub_sat
D117898 added the generic __builtin_elementwise_add_sat and __builtin_elementwise_sub_sat with the same integer behaviour as the SSE/AVX instructions
This patch removes the __builtin_ia32_padd/psub saturated intrinsics and just uses the generics - the existing tests see no changes:
__m256i test_mm256_adds_epi8(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_adds_epi8
// CHECK: call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_adds_epi8(a, b);
}
Added:
Modified:
clang/include/clang/Basic/BuiltinsX86.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Headers/avx2intrin.h
clang/lib/Headers/avx512bwintrin.h
clang/lib/Headers/emmintrin.h
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 0669a96b942b..51d5db64f333 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -254,14 +254,6 @@ TARGET_BUILTIN(__builtin_ia32_minpd, "V2dV2dV2d", "ncV:128:", "sse2")
TARGET_BUILTIN(__builtin_ia32_maxpd, "V2dV2dV2d", "ncV:128:", "sse2")
TARGET_BUILTIN(__builtin_ia32_minsd, "V2dV2dV2d", "ncV:128:", "sse2")
TARGET_BUILTIN(__builtin_ia32_maxsd, "V2dV2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_paddsb128, "V16cV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_paddsw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psubsb128, "V16cV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psubsw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_paddusb128, "V16cV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_paddusw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psubusb128, "V16cV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psubusw128, "V8sV8sV8s", "ncV:128:", "sse2")
TARGET_BUILTIN(__builtin_ia32_pmulhw128, "V8sV8sV8s", "ncV:128:", "sse2")
TARGET_BUILTIN(__builtin_ia32_pavgb128, "V16cV16cV16c", "ncV:128:", "sse2")
TARGET_BUILTIN(__builtin_ia32_pavgw128, "V8sV8sV8s", "ncV:128:", "sse2")
@@ -547,14 +539,6 @@ TARGET_BUILTIN(__builtin_ia32_packsswb256, "V32cV16sV16s", "ncV:256:", "avx2")
TARGET_BUILTIN(__builtin_ia32_packssdw256, "V16sV8iV8i", "ncV:256:", "avx2")
TARGET_BUILTIN(__builtin_ia32_packuswb256, "V32cV16sV16s", "ncV:256:", "avx2")
TARGET_BUILTIN(__builtin_ia32_packusdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_paddsb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_paddsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psubsb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psubsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_paddusb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "ncV:256:", "avx2")
TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "ncV:256:", "avx2")
TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "ncV:256:", "avx2")
TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "ncV:256:", "avx2")
@@ -1009,17 +993,9 @@ TARGET_BUILTIN(__builtin_ia32_packssdw512, "V32sV16iV16i", "ncV:512:", "avx512bw
TARGET_BUILTIN(__builtin_ia32_packsswb512, "V64cV32sV32s", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_packusdw512, "V32sV16iV16i", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_packuswb512, "V64cV32sV32s", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_paddsb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_paddsw512, "V32sV32sV32s", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_paddusb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_paddusw512, "V32sV32sV32s", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_pavgb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_pavgw512, "V32sV32sV32s", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_pshufb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_psubsb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_psubsw512, "V32sV32sV32s", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_psubusb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_psubusw512, "V32sV32sV32s", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_vpconflictdi_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vpconflictdi_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index efb327e6d770..310a1ea583ce 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -12552,13 +12552,6 @@ static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
}
-// Emit binary intrinsic with the same type used in result/args.
-static Value *EmitX86BinaryIntrinsic(CodeGenFunction &CGF,
- ArrayRef<Value *> Ops, Intrinsic::ID IID) {
- llvm::Function *F = CGF.CGM.getIntrinsic(IID, Ops[0]->getType());
- return CGF.Builder.CreateCall(F, {Ops[0], Ops[1]});
-}
-
Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
@@ -15012,34 +15005,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Load->setVolatile(true);
return Load;
}
- case X86::BI__builtin_ia32_paddsb512:
- case X86::BI__builtin_ia32_paddsw512:
- case X86::BI__builtin_ia32_paddsb256:
- case X86::BI__builtin_ia32_paddsw256:
- case X86::BI__builtin_ia32_paddsb128:
- case X86::BI__builtin_ia32_paddsw128:
- return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::sadd_sat);
- case X86::BI__builtin_ia32_paddusb512:
- case X86::BI__builtin_ia32_paddusw512:
- case X86::BI__builtin_ia32_paddusb256:
- case X86::BI__builtin_ia32_paddusw256:
- case X86::BI__builtin_ia32_paddusb128:
- case X86::BI__builtin_ia32_paddusw128:
- return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::uadd_sat);
- case X86::BI__builtin_ia32_psubsb512:
- case X86::BI__builtin_ia32_psubsw512:
- case X86::BI__builtin_ia32_psubsb256:
- case X86::BI__builtin_ia32_psubsw256:
- case X86::BI__builtin_ia32_psubsb128:
- case X86::BI__builtin_ia32_psubsw128:
- return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::ssub_sat);
- case X86::BI__builtin_ia32_psubusb512:
- case X86::BI__builtin_ia32_psubusw512:
- case X86::BI__builtin_ia32_psubusb256:
- case X86::BI__builtin_ia32_psubusw256:
- case X86::BI__builtin_ia32_psubusb128:
- case X86::BI__builtin_ia32_psubusw128:
- return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::usub_sat);
case X86::BI__builtin_ia32_encodekey128_u32: {
Intrinsic::ID IID = Intrinsic::x86_encodekey128;
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index e33514a60ff3..f8521e7d72b5 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -92,25 +92,25 @@ _mm256_add_epi64(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+ return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+ return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+ return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+ return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
}
#define _mm256_alignr_epi8(a, b, n) \
@@ -628,25 +628,25 @@ _mm256_sub_epi64(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi8(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+ return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi16(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+ return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu8(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+ return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu16(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+ return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 522ef100bab1..c99ef9e3bd54 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -617,7 +617,7 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epi8 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
+ return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -639,7 +639,7 @@ _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
+ return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -661,7 +661,7 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epu8 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
+ return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -683,7 +683,7 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epu16 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
+ return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -950,7 +950,7 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epi8 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
+ return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -972,7 +972,7 @@ _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
+ return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -994,7 +994,7 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epu8 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
+ return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1016,7 +1016,7 @@ _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epu16 (__m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
+ return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 4618b808efc4..942a0f788a8c 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2225,7 +2225,7 @@ _mm_add_epi64(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+ return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
}
/// Adds, with saturation, the corresponding elements of two 128-bit
@@ -2247,7 +2247,7 @@ _mm_adds_epi8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+ return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
}
/// Adds, with saturation, the corresponding elements of two 128-bit
@@ -2268,7 +2268,7 @@ _mm_adds_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+ return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
}
/// Adds, with saturation, the corresponding elements of two 128-bit
@@ -2289,7 +2289,7 @@ _mm_adds_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+ return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
}
/// Computes the rounded averages of corresponding elements of two
@@ -2667,7 +2667,7 @@ _mm_sub_epi64(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+ return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
}
/// Subtracts corresponding 16-bit signed integer values in the input and
@@ -2688,7 +2688,7 @@ _mm_subs_epi8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+ return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
}
/// Subtracts corresponding 8-bit unsigned integer values in the input
@@ -2708,7 +2708,7 @@ _mm_subs_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+ return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
}
/// Subtracts corresponding 16-bit unsigned integer values in the input
@@ -2728,7 +2728,7 @@ _mm_subs_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+ return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
}
/// Performs a bitwise AND of two 128-bit integer vectors.
More information about the cfe-commits
mailing list