[clang] [llvm] [X86] EmitX86BuiltinExpr - attempt to convert SSE41/AVX1 roundps/d/ss/sd builtins to regular rounding modes (PR #171227)
Gergo Stomfai via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 15:50:43 PST 2025
https://github.com/stomfaig updated https://github.com/llvm/llvm-project/pull/171227
>From 62647bf9b0323e8ca161dd87657e56e5d6aa20b1 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Sun, 7 Dec 2025 23:37:58 +0000
Subject: [PATCH 01/15] adding initial handlers
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 98 ++++++++++++++++++++++++
1 file changed, 98 insertions(+)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index be2b7d442645e..a3e5c48629228 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -840,6 +840,104 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops[0]);
return Builder.CreateExtractValue(Call, 0);
}
+ case X86::BI__builtin_ia32_roundps:
+ case X86::BI__builtin_ia32_roundpd:
+ case X86::BI__builtin_ia32_roundps256:
+ case X86::BI__builtin_ia32_roundpd256: {
+ unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue();
+ unsigned roundingModeAndPE = M & 0b111;
+ unsigned updatePE = M & 0b100;
+ unsigned use_MXCSR = M & 0b1000;
+
+ Intrinsic::ID ID;
+
+ // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction
+ if (use_MXCSR) {
+ switch (BuiltinID) {
+ case X86::BI__builtin_ia32_roundps: ID = Intrinsic::x86_sse41_round_ps; break;
+ case X86::BI__builtin_ia32_roundpd: ID = Intrinsic::x86_sse41_round_pd; break;
+ }
+ return nullptr;
+ } else {
+ switch (roundingModeAndPE) {
+ default: return nullptr;
+ case 0b000: ID = Intrinsic::nearbyint; break;
+ case 0b001: ID = Intrinsic::floor; break;
+ case 0b010: ID = Intrinsic::ceil; break;
+ case 0b011: ID = Intrinsic::trunc; break;
+ case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op
+ case 0b101: ID = Intrinsic::experimental_constrained_floor; break;
+ case 0b110: ID = Intrinsic::experimental_constrained_ceil; break;
+ case 0b111: ID = Intrinsic::experimental_constrained_trunc; break;
+ }
+ }
+
+ Function *F = CGM.getIntrinsic(ID, Ops[0]->getType());
+
+ if (updatePE) {
+ LLVMContext &Ctx = CGM.getLLVMContext();
+
+ Value *ExceptMode =MetadataAsValue::get(
+ Ctx,
+ MDString::get(Ctx, "fpexcept.strict")
+ );
+
+ return Builder.CreateCall(F, {Ops[0], ExceptMode});
+ }
+
+ return Builder.CreateCall(F, {Ops[0]});
+ }
+ case X86::BI__builtin_ia32_roundss:
+ case X86::BI__builtin_ia32_roundsd: {
+ unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue();
+ unsigned roundingModeAndPE = M & 0b111;
+ unsigned updatePE = M & 0b100;
+ unsigned use_MXCSR = M & 0b1000;
+
+ Intrinsic::ID ID;
+
+ // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction
+ if (use_MXCSR) {
+ switch (BuiltinID) {
+ case X86::BI__builtin_ia32_roundss: ID = Intrinsic::x86_sse41_round_ss; break;
+ case X86::BI__builtin_ia32_roundsd: ID = Intrinsic::x86_sse41_round_sd; break;
+ }
+ return nullptr;
+ } else {
+ switch (roundingModeAndPE) {
+ default: return nullptr;
+ case 0b000: ID = Intrinsic::nearbyint; break;
+ case 0b001: ID = Intrinsic::floor; break;
+ case 0b010: ID = Intrinsic::ceil; break;
+ case 0b011: ID = Intrinsic::trunc; break;
+ case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op
+ case 0b101: ID = Intrinsic::experimental_constrained_floor; break;
+ case 0b110: ID = Intrinsic::experimental_constrained_ceil; break;
+ case 0b111: ID = Intrinsic::experimental_constrained_trunc; break;
+ }
+ }
+
+ Value *idx = Builder.getInt32(0);
+ Value *b0 = Builder.CreateExtractElement(Ops[1], idx);
+ Value *rounded0;
+
+ Function *F = CGM.getIntrinsic(ID, b0->getType());
+
+ if (updatePE) {
+ LLVMContext &Ctx = CGM.getLLVMContext();
+
+ Value *ExceptMode =MetadataAsValue::get(
+ Ctx,
+ MDString::get(Ctx, "fpexcept.strict")
+ );
+
+ rounded0 = Builder.CreateCall(F, {b0, ExceptMode});
+ } else {
+ rounded0 = Builder.CreateCall(F, {b0});
+ }
+
+ return Builder.CreateInsertElement(Ops[0], rounded0, idx);
+ }
case X86::BI__builtin_ia32_lzcnt_u16:
case X86::BI__builtin_ia32_lzcnt_u32:
case X86::BI__builtin_ia32_lzcnt_u64: {
>From 56f72b858744ff3c837170aac2f6b9654dd9be0e Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Sun, 7 Dec 2025 23:38:35 +0000
Subject: [PATCH 02/15] modify relevant tests
---
clang/test/CodeGen/X86/avx-builtins.c | 12 ++++-----
clang/test/CodeGen/X86/pr51324.c | 2 +-
clang/test/CodeGen/X86/sse41-builtins.c | 36 ++++++++++++++++---------
3 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 13da4292c5b92..506327bc910c7 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -246,13 +246,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000
__m256d test_mm256_ceil_pd(__m256d x) {
// CHECK-LABEL: test_mm256_ceil_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.ceil.v4f64(<4 x double> %{{.*}})
return _mm256_ceil_pd(x);
}
__m256 test_mm_ceil_ps(__m256 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <8 x float> @llvm.ceil.v8f32(<8 x float> %{{.*}})
return _mm256_ceil_ps(x);
}
@@ -1095,13 +1095,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL,
__m256d test_mm256_floor_pd(__m256d x) {
// CHECK-LABEL: test_mm256_floor_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.floor.v4f64(<4 x double> %{{.*}})
return _mm256_floor_pd(x);
}
__m256 test_mm_floor_ps(__m256 x) {
// CHECK-LABEL: test_mm_floor_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <8 x float> @llvm.floor.v8f32(<8 x float> %{{.*}})
return _mm256_floor_ps(x);
}
@@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) {
__m256d test_mm256_round_pd(__m256d x) {
// CHECK-LABEL: test_mm256_round_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict")
return _mm256_round_pd(x, 4);
}
__m256 test_mm256_round_ps(__m256 x) {
// CHECK-LABEL: test_mm256_round_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict")
return _mm256_round_ps(x, 4);
}
diff --git a/clang/test/CodeGen/X86/pr51324.c b/clang/test/CodeGen/X86/pr51324.c
index 10d1ba3c84b85..de97183aa6613 100644
--- a/clang/test/CodeGen/X86/pr51324.c
+++ b/clang/test/CodeGen/X86/pr51324.c
@@ -9,7 +9,7 @@
// Make sure brackets work after macro intrinsics.
float pr51324(__m128 a) {
// CHECK-LABEL: pr51324
- // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
+ // call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{.*}})
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm_round_ps(a, 0)[0];
}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 35fa65a99836b..9163b14a9fc11 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -75,25 +75,29 @@ TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f}
__m128d test_mm_ceil_pd(__m128d x) {
// CHECK-LABEL: test_mm_ceil_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}})
return _mm_ceil_pd(x);
}
__m128 test_mm_ceil_ps(__m128 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}})
return _mm_ceil_ps(x);
}
__m128d test_mm_ceil_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_ceil_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.ceil.f64(double %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_ceil_sd(x, y);
}
__m128 test_mm_ceil_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_ceil_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.ceil.f32(float %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_ceil_ss(x, y);
}
@@ -256,25 +260,29 @@ TEST_CONSTEXPR(_mm_extract_ps(((__m128){1.25f, 2.5f, 3.75f, 5.0f}), 6) == __buil
__m128d test_mm_floor_pd(__m128d x) {
// CHECK-LABEL: test_mm_floor_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <2 x double> @llvm.floor.v2f64(<2 x double> %{{.*}})
return _mm_floor_pd(x);
}
__m128 test_mm_floor_ps(__m128 x) {
// CHECK-LABEL: test_mm_floor_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
return _mm_floor_ps(x);
}
__m128d test_mm_floor_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_floor_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.floor.f64(double %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_floor_sd(x, y);
}
__m128 test_mm_floor_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_floor_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.floor.f32(float %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_floor_ss(x, y);
}
@@ -430,25 +438,29 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict")
return _mm_round_pd(x, 4);
}
__m128 test_mm_round_ps(__m128 x) {
// CHECK-LABEL: test_mm_round_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict")
return _mm_round_ps(x, 4);
}
__m128d test_mm_round_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_round_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4)
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.floor.f64(double %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_round_sd(x, y, 4);
}
__m128 test_mm_round_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_round_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4)
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.floor.f32(float %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_round_ss(x, y, 4);
}
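For readers skimming the updated checks: the packed forms are expected to lower straight to the generic vector rounding intrinsics, while the scalar forms round lane 0 of the second operand and merge it back into the first. A minimal sketch of that shape at the source level (the helper names are illustrative, not part of the patch; compile with SSE4.1 enabled):

#include <immintrin.h>

// Sketch: the IR shapes the new CHECK lines above expect.
__m128 floor_packed(__m128 v) {
  // expected: call <4 x float> @llvm.floor.v4f32(<4 x float> %v)
  return _mm_floor_ps(v);
}

__m128d ceil_lane0(__m128d dst, __m128d src) {
  // expected: extractelement lane 0 of %src, call double @llvm.ceil.f64,
  // then insertelement the rounded value into lane 0 of %dst
  return _mm_ceil_sd(dst, src);
}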
>From aab58a9b7309e3daa7b95ddc49da33ddebfef2cb Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Sun, 7 Dec 2025 23:39:05 +0000
Subject: [PATCH 03/15] remove ClangBuiltin from ops
---
llvm/include/llvm/IR/IntrinsicsX86.td | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 1dd23f60c7e1e..6369e97f807fb 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -626,17 +626,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FP rounding ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse41_round_ss : ClangBuiltin<"__builtin_ia32_roundss">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ def int_x86_sse41_round_ss : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_ps : ClangBuiltin<"__builtin_ia32_roundps">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
+ def int_x86_sse41_round_ps : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_sse41_round_sd : ClangBuiltin<"__builtin_ia32_roundsd">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ def int_x86_sse41_round_sd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_pd : ClangBuiltin<"__builtin_ia32_roundpd">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
+ def int_x86_sse41_round_pd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
@@ -921,11 +917,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">,
DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
- def int_x86_avx_round_pd_256 : ClangBuiltin<"__builtin_ia32_roundpd256">,
- DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
+ def int_x86_avx_round_pd_256 : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_avx_round_ps_256 : ClangBuiltin<"__builtin_ia32_roundps256">,
- DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
+ def int_x86_avx_round_ps_256 : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
>From 156d2aa3b72e90699885973f85aaacc0eb930435 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Mon, 8 Dec 2025 23:06:21 +0000
Subject: [PATCH 04/15] moving rounding functionality to helper
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 155 ++++++++++-------------
1 file changed, 69 insertions(+), 86 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a3e5c48629228..167ad4478e6b1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -75,6 +75,70 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
return MaskVec;
}
+static Value *emitX86Round(CodeGenFunction &CGF,
+ Value *X,
+ unsigned M) {
+ unsigned RoundingMask = 0b11;
+ unsigned UpdatePEBit = 0b100;
+ unsigned UseMXCSRBit = 0b1000;
+
+ unsigned roundingMode = M & RoundingMask;
+ bool updatePE = M & UpdatePEBit;
+ bool useMXCSR = M & UseMXCSRBit;
+
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ LLVMContext &Ctx = CGF.CGM.getLLVMContext();
+
+ if (useMXCSR) {
+ ID = Intrinsic::experimental_constrained_nearbyint;
+
+ auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore";
+
+ Value *ExceptMode = MetadataAsValue::get(
+ Ctx,
+ MDString::get(Ctx, PE_metatadata)
+ );
+
+ Value *RoundingMode = MetadataAsValue::get(
+ Ctx,
+ MDString::get(Ctx, "rounding.dynamic")
+ );
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode});
+ }
+
+ if (updatePE) {
+ switch (roundingMode) {
+ case 0b00: ID = Intrinsic::experimental_constrained_roundeven; break;
+ case 0b01: ID = Intrinsic::experimental_constrained_floor; break;
+ case 0b10: ID = Intrinsic::experimental_constrained_ceil; break;
+ case 0b11: ID = Intrinsic::experimental_constrained_trunc; break;
+ default: llvm_unreachable("Invalid rounding mode");
+ }
+
+ Value *ExceptMode =MetadataAsValue::get(
+ Ctx,
+ MDString::get(Ctx, "fpexcept.strict")
+ );
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X, ExceptMode});
+ }
+
+ // Otherwise we can use the standard ops
+ switch (roundingMode) {
+ case 0b00: ID = Intrinsic::roundeven; break;
+ case 0b01: ID = Intrinsic::floor; break;
+ case 0b10: ID = Intrinsic::ceil; break;
+ case 0b11: ID = Intrinsic::trunc; break;
+ default: llvm_unreachable("Invalid rounding mode");
+ }
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X});
+}
+
static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
Align Alignment) {
Value *Ptr = Ops[0];
@@ -843,100 +907,19 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_roundps:
case X86::BI__builtin_ia32_roundpd:
case X86::BI__builtin_ia32_roundps256:
- case X86::BI__builtin_ia32_roundpd256: {
+ case X86::BI__builtin_ia32_roundpd256: {
unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue();
- unsigned roundingModeAndPE = M & 0b111;
- unsigned updatePE = M & 0b100;
- unsigned use_MXCSR = M & 0b1000;
-
- Intrinsic::ID ID;
-
- // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction
- if (use_MXCSR) {
- switch (BuiltinID) {
- case X86::BI__builtin_ia32_roundps: ID = Intrinsic::x86_sse41_round_ps; break;
- case X86::BI__builtin_ia32_roundpd: ID = Intrinsic::x86_sse41_round_pd; break;
- }
- return nullptr;
- } else {
- switch (roundingModeAndPE) {
- default: return nullptr;
- case 0b000: ID = Intrinsic::nearbyint; break;
- case 0b001: ID = Intrinsic::floor; break;
- case 0b010: ID = Intrinsic::ceil; break;
- case 0b011: ID = Intrinsic::trunc; break;
- case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op
- case 0b101: ID = Intrinsic::experimental_constrained_floor; break;
- case 0b110: ID = Intrinsic::experimental_constrained_ceil; break;
- case 0b111: ID = Intrinsic::experimental_constrained_trunc; break;
- }
- }
-
- Function *F = CGM.getIntrinsic(ID, Ops[0]->getType());
-
- if (updatePE) {
- LLVMContext &Ctx = CGM.getLLVMContext();
-
- Value *ExceptMode =MetadataAsValue::get(
- Ctx,
- MDString::get(Ctx, "fpexcept.strict")
- );
-
- return Builder.CreateCall(F, {Ops[0], ExceptMode});
- }
-
- return Builder.CreateCall(F, {Ops[0]});
+ return emitX86Round(*this, Ops[0], M);
}
case X86::BI__builtin_ia32_roundss:
case X86::BI__builtin_ia32_roundsd: {
unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue();
- unsigned roundingModeAndPE = M & 0b111;
- unsigned updatePE = M & 0b100;
- unsigned use_MXCSR = M & 0b1000;
- Intrinsic::ID ID;
-
- // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction
- if (use_MXCSR) {
- switch (BuiltinID) {
- case X86::BI__builtin_ia32_roundss: ID = Intrinsic::x86_sse41_round_ss; break;
- case X86::BI__builtin_ia32_roundsd: ID = Intrinsic::x86_sse41_round_sd; break;
- }
- return nullptr;
- } else {
- switch (roundingModeAndPE) {
- default: return nullptr;
- case 0b000: ID = Intrinsic::nearbyint; break;
- case 0b001: ID = Intrinsic::floor; break;
- case 0b010: ID = Intrinsic::ceil; break;
- case 0b011: ID = Intrinsic::trunc; break;
- case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op
- case 0b101: ID = Intrinsic::experimental_constrained_floor; break;
- case 0b110: ID = Intrinsic::experimental_constrained_ceil; break;
- case 0b111: ID = Intrinsic::experimental_constrained_trunc; break;
- }
- }
-
Value *idx = Builder.getInt32(0);
- Value *b0 = Builder.CreateExtractElement(Ops[1], idx);
- Value *rounded0;
-
- Function *F = CGM.getIntrinsic(ID, b0->getType());
-
- if (updatePE) {
- LLVMContext &Ctx = CGM.getLLVMContext();
-
- Value *ExceptMode =MetadataAsValue::get(
- Ctx,
- MDString::get(Ctx, "fpexcept.strict")
- );
-
- rounded0 = Builder.CreateCall(F, {b0, ExceptMode});
- } else {
- rounded0 = Builder.CreateCall(F, {b0});
- }
+ Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx);
+ Value *RoundedAt0 = emitX86Round(*this, ValAt0, M);
- return Builder.CreateInsertElement(Ops[0], rounded0, idx);
+ return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx);
}
case X86::BI__builtin_ia32_lzcnt_u16:
case X86::BI__builtin_ia32_lzcnt_u32:
>From 7ab45f89c6e3092b210ab2c12a24bd706b8de41c Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Mon, 8 Dec 2025 23:06:41 +0000
Subject: [PATCH 05/15] update tests
---
clang/test/CodeGen/X86/avx-builtins.c | 4 ++--
clang/test/CodeGen/X86/sse41-builtins.c | 8 ++++----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 506327bc910c7..f3844adf0a498 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) {
__m256d test_mm256_round_pd(__m256d x) {
// CHECK-LABEL: test_mm256_round_pd
- // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict")
return _mm256_round_pd(x, 4);
}
__m256 test_mm256_round_ps(__m256 x) {
// CHECK-LABEL: test_mm256_round_ps
- // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict")
return _mm256_round_ps(x, 4);
}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 9163b14a9fc11..f084e1dfade15 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -438,20 +438,20 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
- // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict")
return _mm_round_pd(x, 4);
}
__m128 test_mm_round_ps(__m128 x) {
// CHECK-LABEL: test_mm_round_ps
- // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict")
return _mm_round_ps(x, 4);
}
__m128d test_mm_round_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_round_sd
// CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.floor.f64(double %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.roundeven.f64(double %[[A:.*]], metadata !"fpexcept.strict")
// CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_round_sd(x, y, 4);
}
@@ -459,7 +459,7 @@ __m128d test_mm_round_sd(__m128d x, __m128d y) {
__m128 test_mm_round_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_round_ss
// CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.floor.f32(float %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.roundeven.f32(float %[[A:.*]], metadata !"fpexcept.strict")
// CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_round_ss(x, y, 4);
}
>From 62a18f9a34347c6243c15e0c627b1d74cfe916a7 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Mon, 8 Dec 2025 23:07:42 +0000
Subject: [PATCH 06/15] format
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 76 ++++++++++++++----------
llvm/include/llvm/IR/IntrinsicsX86.td | 32 ++++++----
2 files changed, 63 insertions(+), 45 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 167ad4478e6b1..c8b55e855e717 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -75,52 +75,53 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
return MaskVec;
}
-static Value *emitX86Round(CodeGenFunction &CGF,
- Value *X,
- unsigned M) {
+static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) {
unsigned RoundingMask = 0b11;
unsigned UpdatePEBit = 0b100;
unsigned UseMXCSRBit = 0b1000;
-
+
unsigned roundingMode = M & RoundingMask;
bool updatePE = M & UpdatePEBit;
bool useMXCSR = M & UseMXCSRBit;
-
+
Intrinsic::ID ID = Intrinsic::not_intrinsic;
LLVMContext &Ctx = CGF.CGM.getLLVMContext();
-
+
if (useMXCSR) {
ID = Intrinsic::experimental_constrained_nearbyint;
-
+
auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore";
- Value *ExceptMode = MetadataAsValue::get(
- Ctx,
- MDString::get(Ctx, PE_metatadata)
- );
+ Value *ExceptMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, PE_metatadata));
- Value *RoundingMode = MetadataAsValue::get(
- Ctx,
- MDString::get(Ctx, "rounding.dynamic")
- );
+ Value *RoundingMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic"));
Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode});
- }
+ }
if (updatePE) {
switch (roundingMode) {
- case 0b00: ID = Intrinsic::experimental_constrained_roundeven; break;
- case 0b01: ID = Intrinsic::experimental_constrained_floor; break;
- case 0b10: ID = Intrinsic::experimental_constrained_ceil; break;
- case 0b11: ID = Intrinsic::experimental_constrained_trunc; break;
- default: llvm_unreachable("Invalid rounding mode");
+ case 0b00:
+ ID = Intrinsic::experimental_constrained_roundeven;
+ break;
+ case 0b01:
+ ID = Intrinsic::experimental_constrained_floor;
+ break;
+ case 0b10:
+ ID = Intrinsic::experimental_constrained_ceil;
+ break;
+ case 0b11:
+ ID = Intrinsic::experimental_constrained_trunc;
+ break;
+ default:
+ llvm_unreachable("Invalid rounding mode");
}
- Value *ExceptMode =MetadataAsValue::get(
- Ctx,
- MDString::get(Ctx, "fpexcept.strict")
- );
+ Value *ExceptMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
return CGF.Builder.CreateCall(F, {X, ExceptMode});
@@ -128,11 +129,20 @@ static Value *emitX86Round(CodeGenFunction &CGF,
// Otherwise we can use the standard ops
switch (roundingMode) {
- case 0b00: ID = Intrinsic::roundeven; break;
- case 0b01: ID = Intrinsic::floor; break;
- case 0b10: ID = Intrinsic::ceil; break;
- case 0b11: ID = Intrinsic::trunc; break;
- default: llvm_unreachable("Invalid rounding mode");
+ case 0b00:
+ ID = Intrinsic::roundeven;
+ break;
+ case 0b01:
+ ID = Intrinsic::floor;
+ break;
+ case 0b10:
+ ID = Intrinsic::ceil;
+ break;
+ case 0b11:
+ ID = Intrinsic::trunc;
+ break;
+ default:
+ llvm_unreachable("Invalid rounding mode");
}
Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
@@ -907,18 +917,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_roundps:
case X86::BI__builtin_ia32_roundpd:
case X86::BI__builtin_ia32_roundps256:
- case X86::BI__builtin_ia32_roundpd256: {
+ case X86::BI__builtin_ia32_roundpd256: {
unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue();
return emitX86Round(*this, Ops[0], M);
}
case X86::BI__builtin_ia32_roundss:
case X86::BI__builtin_ia32_roundsd: {
unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue();
-
+
Value *idx = Builder.getInt32(0);
Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx);
Value *RoundedAt0 = emitX86Round(*this, ValAt0, M);
-
+
return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx);
}
case X86::BI__builtin_ia32_lzcnt_u16:
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 6369e97f807fb..7838e410badd7 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -626,14 +626,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FP rounding ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse41_round_ss : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_ps : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_sse41_round_sd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_pd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_sse41_round_ss
+ : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ def int_x86_sse41_round_ps
+ : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_sse41_round_sd
+ : DefaultAttrsIntrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ def int_x86_sse41_round_pd
+ : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
// Vector min element
@@ -917,10 +923,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">,
DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
- def int_x86_avx_round_pd_256 : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_avx_round_ps_256 : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_avx_round_pd_256
+ : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_avx_round_ps_256
+ : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
// Horizontal ops
>From c4eff0d6076ebaeaaef481f4adc914bfc349ec4a Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Tue, 9 Dec 2025 18:04:23 +0000
Subject: [PATCH 07/15] resolving comments
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 43 ++++--------------------
clang/test/CodeGen/X86/avx-builtins.c | 4 +--
clang/test/CodeGen/X86/sse41-builtins.c | 8 ++---
3 files changed, 13 insertions(+), 42 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index c8b55e855e717..d4c25cdc8b0ab 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -75,14 +75,13 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
return MaskVec;
}
-static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) {
- unsigned RoundingMask = 0b11;
- unsigned UpdatePEBit = 0b100;
- unsigned UseMXCSRBit = 0b1000;
+// Emit rounding for the value X according to the rounding RoundingControl.
+static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned RoundingControl) {
+ unsigned roundingMask = 0b11;
+ unsigned useMXCSRBit = 0b1000;
- unsigned roundingMode = M & RoundingMask;
- bool updatePE = M & UpdatePEBit;
- bool useMXCSR = M & UseMXCSRBit;
+ unsigned roundingMode = RoundingControl & roundingMask;
+ bool useMXCSR = RoundingControl & useMXCSRBit;
Intrinsic::ID ID = Intrinsic::not_intrinsic;
LLVMContext &Ctx = CGF.CGM.getLLVMContext();
@@ -90,10 +89,8 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) {
if (useMXCSR) {
ID = Intrinsic::experimental_constrained_nearbyint;
- auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore";
-
Value *ExceptMode =
- MetadataAsValue::get(Ctx, MDString::get(Ctx, PE_metatadata));
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore"));
Value *RoundingMode =
MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic"));
@@ -102,32 +99,6 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) {
return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode});
}
- if (updatePE) {
- switch (roundingMode) {
- case 0b00:
- ID = Intrinsic::experimental_constrained_roundeven;
- break;
- case 0b01:
- ID = Intrinsic::experimental_constrained_floor;
- break;
- case 0b10:
- ID = Intrinsic::experimental_constrained_ceil;
- break;
- case 0b11:
- ID = Intrinsic::experimental_constrained_trunc;
- break;
- default:
- llvm_unreachable("Invalid rounding mode");
- }
-
- Value *ExceptMode =
- MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
-
- Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
- return CGF.Builder.CreateCall(F, {X, ExceptMode});
- }
-
- // Otherwise we can use the standard ops
switch (roundingMode) {
case 0b00:
ID = Intrinsic::roundeven;
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index f3844adf0a498..e6b8b57b8cb30 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) {
__m256d test_mm256_round_pd(__m256d x) {
// CHECK-LABEL: test_mm256_round_pd
- // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %{{.*}})
return _mm256_round_pd(x, 4);
}
__m256 test_mm256_round_ps(__m256 x) {
// CHECK-LABEL: test_mm256_round_ps
- // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %{{.*}})
return _mm256_round_ps(x, 4);
}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index f084e1dfade15..3d2eacfcb9287 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -438,20 +438,20 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
- // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %{{.*}})
return _mm_round_pd(x, 4);
}
__m128 test_mm_round_ps(__m128 x) {
// CHECK-LABEL: test_mm_round_ps
- // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %{{.*}})
return _mm_round_ps(x, 4);
}
__m128d test_mm_round_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_round_sd
// CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.roundeven.f64(double %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %[[B:.*]] = call double @llvm.roundeven.f64(double %[[A:.*]])
// CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_round_sd(x, y, 4);
}
@@ -459,7 +459,7 @@ __m128d test_mm_round_sd(__m128d x, __m128d y) {
__m128 test_mm_round_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_round_ss
// CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.roundeven.f32(float %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %[[B:.*]] = call float @llvm.roundeven.f32(float %[[A:.*]])
// CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_round_ss(x, y, 4);
}
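With this revision the decode in emitX86Round is settled: bit 3 of the rounding-control immediate selects the MXCSR-driven path (a constrained nearbyint with dynamic rounding metadata), otherwise bits 1:0 pick one of the concrete rounding intrinsics. A plain restatement of that mapping, detached from the CodeGen APIs (the function name is illustrative only):

#include <cstdint>
#include <string>

// Sketch: the intrinsic family emitX86Round selects for a given
// SSE4.1/AVX rounding-control immediate, as of this patch.
std::string selectRoundingIntrinsic(uint32_t RoundingControl) {
  const uint32_t RoundingMask = 0b11;  // bits 1:0 - rounding mode
  const uint32_t UseMXCSRBit = 0b1000; // bit 3, as this patch interprets it

  if (RoundingControl & UseMXCSRBit)
    return "llvm.experimental.constrained.nearbyint"; // dynamic rounding

  switch (RoundingControl & RoundingMask) {
  case 0b00: return "llvm.roundeven";
  case 0b01: return "llvm.floor";
  case 0b10: return "llvm.ceil";
  case 0b11: return "llvm.trunc";
  }
  return ""; // unreachable: the mask limits the value to 0..3
}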
>From 9b2cda2ccaa0631133d0dee1e378951d1db09cce Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Tue, 9 Dec 2025 18:04:39 +0000
Subject: [PATCH 08/15] format
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index d4c25cdc8b0ab..fc10f460e6dc4 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -76,7 +76,8 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
}
// Emit rounding for the value X according to the rounding RoundingControl.
-static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned RoundingControl) {
+static Value *emitX86Round(CodeGenFunction &CGF, Value *X,
+ unsigned RoundingControl) {
unsigned roundingMask = 0b11;
unsigned useMXCSRBit = 0b1000;
>From 4202a3a74c1a45660236351477c9707f74251233 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 01:24:00 +0000
Subject: [PATCH 09/15] save
---
llvm/lib/Target/X86/X86InstrSSE.td | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e4aaa1e1b594a..fada8ccb9808a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5707,6 +5707,23 @@ let Predicates = [UseSSE41, OptForSize] in {
(ROUNDSDmi addr:$src1, timm:$src2)>;
}
+multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
+ ValueType VT, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (insertelt VT:$dst, (OpNode (extractelt VT:$src, 0)), 0)),
+ (!cast<Instruction>(OpcPrefix#ri_Int) $dst, $src, 0)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, 0)>;
+ }
+}
+
+defm : test<any_fceil, "ROUNDSS", X86Movss, v4f32, UseSSE41>;
+
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
>From a1825fb75bccb95560333b3b624acfccaa3363a2 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 17:05:57 +0000
Subject: [PATCH 10/15] add pattern to eliminate round + blend in asm
---
llvm/lib/Target/X86/X86InstrSSE.td | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index fada8ccb9808a..24d19b40ad41d 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5710,19 +5710,20 @@ let Predicates = [UseSSE41, OptForSize] in {
multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
ValueType VT, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
- def : Pat<(VT (insertelt VT:$dst, (OpNode (extractelt VT:$src, 0)), 0)),
- (!cast<Instruction>(OpcPrefix#ri_Int) $dst, $src, 0)>;
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))),
+ (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>;
}
// Repeat for AVX versions of the instructions.
let Predicates = [UseAVX] in {
def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, 0)>;
+ (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))),
+ (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>;
}
}
-
-defm : test<any_fceil, "ROUNDSS", X86Movss, v4f32, UseSSE41>;
+defm : test<X86any_VRndScale, "ROUNDSS", X86Movss, v4f32, UseSSE41>;
+defm : test<X86any_VRndScale, "ROUNDSD", X86Movsd, v2f64, UseSSE41>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
>From 7f84d4919098c7b1e80596f3ff45079e7d83aba1 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 17:07:41 +0000
Subject: [PATCH 11/15] style: varnames are capitalised
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index fc10f460e6dc4..8d43fd90a5247 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -78,16 +78,16 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
// Emit rounding for the value X according to the rounding RoundingControl.
static Value *emitX86Round(CodeGenFunction &CGF, Value *X,
unsigned RoundingControl) {
- unsigned roundingMask = 0b11;
- unsigned useMXCSRBit = 0b1000;
+ unsigned RoundingMask = 0b11;
+ unsigned UseMXCSRBit = 0b1000;
- unsigned roundingMode = RoundingControl & roundingMask;
- bool useMXCSR = RoundingControl & useMXCSRBit;
+ unsigned RoundingMode = RoundingControl & RoundingMask;
+ bool UseMXCSR = RoundingControl & UseMXCSRBit;
Intrinsic::ID ID = Intrinsic::not_intrinsic;
LLVMContext &Ctx = CGF.CGM.getLLVMContext();
- if (useMXCSR) {
+ if (UseMXCSR) {
ID = Intrinsic::experimental_constrained_nearbyint;
Value *ExceptMode =
@@ -100,7 +100,7 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X,
return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode});
}
- switch (roundingMode) {
+ switch (RoundingMode) {
case 0b00:
ID = Intrinsic::roundeven;
break;
>From c9cfd3df5cd6c50d61d916a8e107d1d71a0ea4b1 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 22:57:19 +0000
Subject: [PATCH 12/15] fix: correct order and value for metadata args
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 8d43fd90a5247..8b804ab561e09 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -90,14 +90,14 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X,
if (UseMXCSR) {
ID = Intrinsic::experimental_constrained_nearbyint;
+ Value *RoundingMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));
+
Value *ExceptMode =
MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore"));
- Value *RoundingMode =
- MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic"));
-
Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
- return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode});
+ return CGF.Builder.CreateCall(F, {X, RoundingMode, ExceptMode});
}
switch (RoundingMode) {
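To summarise the fix: the constrained intrinsic takes the value, then the rounding-mode metadata, then the exception-behaviour metadata, and the rounding string is "round.dynamic". A condensed restatement of the helper's MXCSR path after this change (the function name is illustrative; this assumes the declarations and usings already in scope in X86.cpp):

// Sketch of emitX86Round's MXCSR branch after this fix.
static Value *emitDynamicRound(CodeGenFunction &CGF, Value *X) {
  LLVMContext &Ctx = CGF.CGM.getLLVMContext();

  // Operand order: value, rounding metadata, exception metadata.
  Value *RoundingMode =
      MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));
  Value *ExceptMode =
      MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore"));

  Function *F = CGF.CGM.getIntrinsic(
      Intrinsic::experimental_constrained_nearbyint, X->getType());
  return CGF.Builder.CreateCall(F, {X, RoundingMode, ExceptMode});
}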
>From c9923c145b8bb60fe24a388ce37d7f09c42f048d Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 22:58:34 +0000
Subject: [PATCH 13/15] test: add tests for constrained ops
---
.../CodeGen/X86/avx-builtins-constrainted.c | 36 ++++++++++++++
.../CodeGen/X86/sse41-builtins-constrained.c | 49 +++++++++++++++++++
2 files changed, 85 insertions(+)
create mode 100644 clang/test/CodeGen/X86/avx-builtins-constrainted.c
create mode 100644 clang/test/CodeGen/X86/sse41-builtins-constrained.c
diff --git a/clang/test/CodeGen/X86/avx-builtins-constrainted.c b/clang/test/CodeGen/X86/avx-builtins-constrainted.c
new file mode 100644
index 0000000000000..cbd4060364139
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx-builtins-constrainted.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+#include "builtin_test_helpers.h"
+
+__m256d test_mm256_round_pd(__m256d x) {
+ // CHECK-LABEL: test_mm256_round_pd
+ // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ return _mm256_round_pd(x, 8);
+}
+
+__m256 test_mm256_round_ps(__m256 x) {
+ // CHECK-LABEL: test_mm256_round_ps
+ // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ return _mm256_round_ps(x, 8);
+}
\ No newline at end of file
diff --git a/clang/test/CodeGen/X86/sse41-builtins-constrained.c b/clang/test/CodeGen/X86/sse41-builtins-constrained.c
new file mode 100644
index 0000000000000..ed6c95d37872f
--- /dev/null
+++ b/clang/test/CodeGen/X86/sse41-builtins-constrained.c
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+
+
+#include <immintrin.h>
+#include "builtin_test_helpers.h"
+
+__m128d test_mm_round_pd(__m128d x) {
+ // CHECK-LABEL: test_mm_round_pd
+ // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ return _mm_round_pd(x, 8);
+}
+
+__m128 test_mm_round_ps(__m128 x) {
+ // CHECK-LABEL: test_mm_round_ps
+ // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ return _mm_round_ps(x, 8);
+}
+
+__m128d test_mm_round_sd(__m128d x, __m128d y) {
+ // CHECK-LABEL: test_mm_round_sd
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.nearbyint.f64(double %[[A:.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
+ return _mm_round_sd(x, y, 8);
+}
+
+__m128 test_mm_round_ss(__m128 x, __m128 y) {
+ // CHECK-LABEL: test_mm_round_ss
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.nearbyint.f32(float %[[A:.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
+ return _mm_round_ss(x, y, 8);
+}
\ No newline at end of file
>From 0cc45b674d42d84586f54b4b0d0f65dd7c17abff Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 23:02:21 +0000
Subject: [PATCH 14/15] format
---
llvm/lib/Target/X86/X86InstrSSE.td | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 24d19b40ad41d..d4996c27dbe8d 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5708,18 +5708,22 @@ let Predicates = [UseSSE41, OptForSize] in {
}
multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
- ValueType VT, Predicate BasePredicate> {
+ ValueType VT, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
- def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))),
- (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>;
+ def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+ (extractelt VT:$src, (i64 0)),
+ i32:$imm)))),
+ (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src,
+ i32:$imm)>;
}
// Repeat for AVX versions of the instructions.
let Predicates = [UseAVX] in {
- def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))),
- (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>;
+ def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+ (extractelt VT:$src, (i64 0)),
+ i32:$imm)))),
+ (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src,
+ i32:$imm)>;
}
}
defm : test<X86any_VRndScale, "ROUNDSS", X86Movss, v4f32, UseSSE41>;
>From d69d36a8b50a3ea63433c59343b9cc2daadd127e Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 11 Dec 2025 23:48:50 +0000
Subject: [PATCH 15/15] tests: fix vec_floor tests
---
llvm/test/CodeGen/X86/vec_floor.ll | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 7f4ed3394d10d..2327036a8e1d2 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -821,8 +821,7 @@ define <4 x float> @const_trunc_v4f32() {
define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_ss:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundss $9, %xmm0, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: roundss $9, %xmm0, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_ss:
@@ -846,8 +845,7 @@ declare float @llvm.floor.f32(float %s)
define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_sd:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: roundsd $9, %xmm0, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_sd:
@@ -1811,8 +1809,7 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_ss:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundss $10, %xmm0, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: roundss $10, %xmm0, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_ss:
@@ -1836,8 +1833,7 @@ declare float @llvm.ceil.f32(float %s)
define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_sd:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundsd $10, %xmm0, %xmm0
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: roundsd $10, %xmm0, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_sd: