[clang] [CIR] X86 vector fcmp-sse vector builtins (PR #167125)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Nov 13 23:48:21 PST 2025
https://github.com/woruyu updated https://github.com/llvm/llvm-project/pull/167125
From 7357dc60c17d55f5d0ecffdd958e23e8bc677240 Mon Sep 17 00:00:00 2001
From: liuzhenya <zyliu at siorigin.com>
Date: Thu, 13 Nov 2025 21:47:38 -1000
Subject: [PATCH] [CIR] X86 vector fcmp-sse vector builtins
---
.../CIR/Dialect/Builder/CIRBaseBuilder.h | 18 ++
clang/include/clang/CIR/MissingFeatures.h | 1 +
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 65 +++++-
clang/test/CIR/CodeGen/builtin-fcmp-sse.c | 213 ++++++++++++++++++
4 files changed, 286 insertions(+), 11 deletions(-)
create mode 100644 clang/test/CIR/CodeGen/builtin-fcmp-sse.c
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 6c7e3d055456a..9bf8975414951 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -131,6 +131,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
return cir::IntType::get(getContext(), n, false);
}
+ static unsigned getCIRIntOrFloatBitWidth(mlir::Type eltTy) {
+ if (auto intType = mlir::dyn_cast<cir::IntTypeInterface>(eltTy))
+ return intType.getWidth();
+ if (auto floatType = mlir::dyn_cast<cir::FPTypeInterface>(eltTy))
+ return floatType.getWidth();
+
+ llvm_unreachable("Unsupported type in getCIRIntOrFloatBitWidth");
+ }
cir::IntType getSIntNTy(int n) {
return cir::IntType::get(getContext(), n, true);
}
@@ -584,6 +592,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
return cir::CmpOp::create(*this, loc, getBoolTy(), kind, lhs, rhs);
}
+ cir::VecCmpOp createVecCompare(mlir::Location loc, cir::CmpOpKind kind,
+ mlir::Value lhs, mlir::Value rhs) {
+ VectorType vecCast = mlir::cast<VectorType>(lhs.getType());
+ IntType integralTy =
+ getSIntNTy(getCIRIntOrFloatBitWidth(vecCast.getElementType()));
+ VectorType integralVecTy =
+ VectorType::get(context, integralTy, vecCast.getSize());
+ return cir::VecCmpOp::create(*this, loc, integralVecTy, kind, lhs, rhs);
+ }
+
mlir::Value createIsNaN(mlir::Location loc, mlir::Value operand) {
return createCompare(loc, cir::CmpOpKind::ne, operand, operand);
}
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 567c79a27c07b..525d2b00392ff 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -258,6 +258,7 @@ struct MissingFeatures {
static bool emitBranchThroughCleanup() { return false; }
static bool emitCheckedInBoundsGEP() { return false; }
static bool emitCondLikelihoodViaExpectIntrinsic() { return false; }
+ static bool emitConstrainedFPCall() { return false; }
static bool emitLifetimeMarkers() { return false; }
static bool emitLValueAlignmentAssumption() { return false; }
static bool emitNullCheckForDeleteCalls() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 2d6cf30fa2ded..c366b7e61f0f9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -34,18 +34,53 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e,
.getResult();
}
+// OG has unordered comparisons as a form of optimization in addition to
+// ordered comparisons, while CIR does not.
+//
+// This means that we can't encode a comparison code such as UGT (unordered
+// greater than), at least not at the CIR level.
+//
+// The boolean shouldInvert compensates for this.
+// For example, to get the comparison code UGT, we call
+// emitVectorFCmp(OLE, shouldInvert = true), since OLE is the inverse of UGT.
+
+// There are several ways this could be supported otherwise:
+// - Register extra CmpOpKind values for the unordered comparison types and
+//   build the translation code to go from CIR -> LLVM dialect. Note that we
+//   get this naturally with shouldInvert, benefiting from existing
+//   infrastructure, albeit at the cost of having to generate an extra `not`
+//   at the CIR level.
+// - Add the extra comparison codes to a new VecCmpOpKind instead of
+//   cluttering CmpOpKind.
+// - Add a boolean to VecCmpOp to indicate whether the comparison is unordered
+//   or ordered.
+// - Emit the intrinsic call directly instead of using this helper; see how
+//   the LLVM lowering handles this.
+static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
+ llvm::SmallVector<mlir::Value> &ops,
+ mlir::Location loc, cir::CmpOpKind pred,
+ bool shouldInvert) {
+ assert(!cir::MissingFeatures::cgFPOptionsRAII());
+ // TODO(cir): Add an isSignaling boolean once emitConstrainedFPCall is implemented
+ assert(!cir::MissingFeatures::emitConstrainedFPCall());
+ mlir::Value cmp = builder.createVecCompare(loc, pred, ops[0], ops[1]);
+ mlir::Value bitCast = builder.createBitcast(
+ shouldInvert ? builder.createNot(cmp) : cmp, ops[0].getType());
+ return bitCast;
+}
+
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
- const CallExpr *e) {
+ const CallExpr *expr) {
if (builtinID == Builtin::BI__builtin_cpu_is) {
- cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is");
+ cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is");
return {};
}
if (builtinID == Builtin::BI__builtin_cpu_supports) {
- cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports");
+ cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_supports");
return {};
}
if (builtinID == Builtin::BI__builtin_cpu_init) {
- cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init");
+ cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_init");
return {};
}
@@ -66,7 +101,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
getContext().GetBuiltinType(builtinID, error, &iceArguments);
assert(error == ASTContext::GE_None && "Error while getting builtin type.");
- for (auto [idx, arg] : llvm::enumerate(e->arguments())) {
+ for (auto [idx, arg] : llvm::enumerate(expr->arguments())) {
ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg));
}
@@ -77,15 +112,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
default:
return {};
case X86::BI_mm_clflush:
- return emitIntrinsicCallOp(*this, e, "x86.sse2.clflush", voidTy, ops[0]);
+ return emitIntrinsicCallOp(*this, expr, "x86.sse2.clflush", voidTy, ops[0]);
case X86::BI_mm_lfence:
- return emitIntrinsicCallOp(*this, e, "x86.sse2.lfence", voidTy);
+ return emitIntrinsicCallOp(*this, expr, "x86.sse2.lfence", voidTy);
case X86::BI_mm_pause:
- return emitIntrinsicCallOp(*this, e, "x86.sse2.pause", voidTy);
+ return emitIntrinsicCallOp(*this, expr, "x86.sse2.pause", voidTy);
case X86::BI_mm_mfence:
- return emitIntrinsicCallOp(*this, e, "x86.sse2.mfence", voidTy);
+ return emitIntrinsicCallOp(*this, expr, "x86.sse2.mfence", voidTy);
case X86::BI_mm_sfence:
- return emitIntrinsicCallOp(*this, e, "x86.sse.sfence", voidTy);
+ return emitIntrinsicCallOp(*this, expr, "x86.sse.sfence", voidTy);
case X86::BI_mm_prefetch:
case X86::BI__rdtsc:
case X86::BI__builtin_ia32_rdtscp:
@@ -741,10 +776,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_cmpunordpd:
case X86::BI__builtin_ia32_cmpneqps:
case X86::BI__builtin_ia32_cmpneqpd:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return {};
case X86::BI__builtin_ia32_cmpnltps:
case X86::BI__builtin_ia32_cmpnltpd:
+ return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()),
+ cir::CmpOpKind::lt, /*shouldInvert=*/true);
case X86::BI__builtin_ia32_cmpnleps:
case X86::BI__builtin_ia32_cmpnlepd:
+ return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()),
+ cir::CmpOpKind::le, /*shouldInvert=*/true);
case X86::BI__builtin_ia32_cmpordps:
case X86::BI__builtin_ia32_cmpordpd:
case X86::BI__builtin_ia32_cmpph128_mask:
@@ -829,7 +872,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
case X86::BI__builtin_ia32_prefetchi:
- cgm.errorNYI(e->getSourceRange(),
+ cgm.errorNYI(expr->getSourceRange(),
std::string("unimplemented X86 builtin call: ") +
getContext().BuiltinInfo.getName(builtinID));
return {};
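
[Editorial note, not part of the patch] To make the shouldInvert comment above concrete, here is a minimal standalone sketch of the same idea in plain C++: an unordered predicate (UGT) realized as an ordered compare (OLE) followed by a bitwise inversion of the lane mask. OrderedPred and cmpLane are hypothetical stand-ins for illustration only, not CIR types or APIs.

  #include <cassert>
  #include <limits>

  // Hypothetical stand-ins -- not CIR types or APIs.
  enum class OrderedPred { olt, ole };

  // Scalar model of one vector lane: returns an all-ones / all-zeros mask,
  // mirroring the sign-extended result of cir.vec.cmp.
  static int cmpLane(OrderedPred p, float a, float b, bool shouldInvert) {
    bool ordered = (a == a) && (b == b);   // neither operand is NaN
    bool res = ordered && (p == OrderedPred::olt ? a < b : a <= b);
    if (shouldInvert)
      res = !res;                          // e.g. OLE + invert == UGT
    return res ? -1 : 0;
  }

  int main() {
    float nan = std::numeric_limits<float>::quiet_NaN();
    // UGT(1.0, NaN) is true because the operands are unordered...
    assert(cmpLane(OrderedPred::ole, 1.0f, nan, /*shouldInvert=*/true) == -1);
    // ...while UGT(1.0, 2.0) is false.
    assert(cmpLane(OrderedPred::ole, 1.0f, 2.0f, /*shouldInvert=*/true) == 0);
    return 0;
  }
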
diff --git a/clang/test/CIR/CodeGen/builtin-fcmp-sse.c b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c
new file mode 100644
index 0000000000000..c273d6b3fca0e
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c
@@ -0,0 +1,213 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
+
+__m128 test_cmpnleps(__m128 A, __m128 B) {
+ // CIR-LABEL: cir.func dso_local @test_cmpnleps(
+ // CIR: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) {
+ // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["A", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["B", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["__retval"] {alignment = 16 : i64}
+ // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+ // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+ // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+ // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+ // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+ // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+ // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+ // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+ // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+ // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float>
+ // CIR: }
+
+ // LLVM-LABEL: define dso_local <4 x float> @test_cmpnleps(
+ // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+ // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16
+ // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16
+ // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16
+ // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16
+ // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16
+ // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <4 x float> [[TMP6]], [[TMP7]]
+ // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+ // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1)
+ // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float>
+ // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16
+ // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16
+ // LLVM-NEXT: ret <4 x float> [[TMP12]]
+
+ // OGCG-LABEL: define dso_local <4 x float> @test_cmpnleps(
+ // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ // OGCG-NEXT: [[ENTRY:.*:]]
+ // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+ // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+ // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP0]], [[TMP1]]
+ // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+ // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+ // OGCG-NEXT: ret <4 x float> [[TMP4]]
+ return __builtin_ia32_cmpnleps(A, B);
+}
+
+__m128d test_cmpnlepd(__m128d A, __m128d B) {
+ // CIR-LABEL: cir.func dso_local @test_cmpnlepd(
+ // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) {
+ // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["A", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["B", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["__retval"] {alignment = 16 : i64}
+ // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+ // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+ // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+ // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+ // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+ // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+ // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
+ // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+ // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+ // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double>
+ // CIR: }
+
+ // LLVM-LABEL: define dso_local <2 x double> @test_cmpnlepd(
+ // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+ // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16
+ // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16
+ // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16
+ // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16
+ // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
+ // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <2 x double> [[TMP6]], [[TMP7]]
+ // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64>
+ // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1)
+ // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double>
+ // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16
+ // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16
+ // LLVM-NEXT: ret <2 x double> [[TMP12]]
+
+ // OGCG-LABEL: define dso_local <2 x double> @test_cmpnlepd(
+ // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ // OGCG-NEXT: [[ENTRY:.*:]]
+ // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+ // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+ // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <2 x double> [[TMP0]], [[TMP1]]
+ // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+ // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+ // OGCG-NEXT: ret <2 x double> [[TMP4]]
+ return __builtin_ia32_cmpnlepd(A, B);
+}
+
+__m128 test_cmpnltps(__m128 A, __m128 B) {
+ // CIR-LABEL: cir.func dso_local @test_cmpnltps(
+ // CIR-SAME: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) {
+ // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["A", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["B", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["__retval"] {alignment = 16 : i64}
+ // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+ // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+ // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+ // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+ // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+ // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+ // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+ // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+ // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+ // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float>
+ // CIR: }
+
+ // LLVM-LABEL: define dso_local <4 x float> @test_cmpnltps(
+ // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+ // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16
+ // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16
+ // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16
+ // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16
+ // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16
+ // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <4 x float> [[TMP6]], [[TMP7]]
+ // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+ // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1)
+ // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float>
+ // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16
+ // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16
+ // LLVM-NEXT: ret <4 x float> [[TMP12]]
+
+ // OGCG-LABEL: define dso_local <4 x float> @test_cmpnltps(
+ // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ // OGCG-NEXT: [[ENTRY:.*:]]
+ // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+ // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+ // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <4 x float> [[TMP0]], [[TMP1]]
+ // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+ // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+ // OGCG-NEXT: ret <4 x float> [[TMP4]]
+ return __builtin_ia32_cmpnltps(A, B);
+}
+
+__m128d test_cmpnltpd(__m128d A, __m128d B) {
+ // CIR-LABEL: cir.func dso_local @test_cmpnltpd(
+ // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) {
+ // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["A", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["B", init] {alignment = 16 : i64}
+ // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["__retval"] {alignment = 16 : i64}
+ // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+ // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+ // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+ // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+ // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+ // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+ // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
+ // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+ // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+ // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double>
+ // CIR: }
+
+ // LLVM-LABEL: define dso_local <2 x double> @test_cmpnltpd(
+ // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+ // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16
+ // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16
+ // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16
+ // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16
+ // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
+ // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16
+ // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP6]], [[TMP7]]
+ // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64>
+ // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1)
+ // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double>
+ // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16
+ // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16
+ // LLVM-NEXT: ret <2 x double> [[TMP12]]
+
+ // OGCG-LABEL: define dso_local <2 x double> @test_cmpnltpd(
+ // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ // OGCG-NEXT: [[ENTRY:.*:]]
+ // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+ // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+ // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+ // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+ // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <2 x double> [[TMP0]], [[TMP1]]
+ // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+ // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+ // OGCG-NEXT: ret <2 x double> [[TMP4]]
+ return __builtin_ia32_cmpnltpd(A, B);
+}
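
[Editorial usage note, not part of the patch] Application code normally reaches these builtins through the SSE compare intrinsics rather than calling them directly. A minimal sketch follows, assuming the usual <xmmintrin.h>/<emmintrin.h> wrappers map _mm_cmpnle_ps and _mm_cmpnlt_pd onto the builtins handled above.

  #include <emmintrin.h>  // SSE2: __m128d, _mm_cmpnlt_pd
  #include <xmmintrin.h>  // SSE:  __m128,  _mm_cmpnle_ps

  // Each result lane is an all-ones mask where the "not less-or-equal"
  // (unordered-greater-than) relation holds, and all-zeros otherwise.
  __m128 mask_not_le(__m128 a, __m128 b) {
    return _mm_cmpnle_ps(a, b);   // expected to reach __builtin_ia32_cmpnleps
  }

  // Likewise for "not less-than" (unordered-greater-or-equal) on doubles.
  __m128d mask_not_lt(__m128d a, __m128d b) {
    return _mm_cmpnlt_pd(a, b);   // expected to reach __builtin_ia32_cmpnltpd
  }
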