[llvm] 8217c2e - [VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well (#120075)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 16 09:23:09 PST 2024
Author: Simon Pilgrim
Date: 2024-12-16T17:23:04Z
New Revision: 8217c2eaef2f93427735a45c45c7fd91178e2ed8
URL: https://github.com/llvm/llvm-project/commit/8217c2eaef2f93427735a45c45c7fd91178e2ed8
DIFF: https://github.com/llvm/llvm-project/commit/8217c2eaef2f93427735a45c45c7fd91178e2ed8.diff
LOG: [VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well (#120075)
Extend the binary-operator matching so that it also matches compare instructions (icmp/fcmp), requiring both compares to use the same predicate.
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e0304944df3c0c..db77d6c955792c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1628,7 +1628,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
-/// TODO: Handle "shuffle (cmp), (cmp)" into "cmp (shuffle), (shuffle)".
+/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
ArrayRef<int> OldMask;
Instruction *LHS, *RHS;
@@ -1636,31 +1636,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
return false;
- BinaryOperator *B0, *B1;
- if (!match(LHS, m_BinOp(B0)) || !match(RHS, m_BinOp(B1)))
- return false;
-
- // Don't introduce poison into div/rem.
- if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem())
+ // TODO: Add support for addlike etc.
+ if (LHS->getOpcode() != RHS->getOpcode())
return false;
- // TODO: Add support for addlike etc.
- Instruction::BinaryOps Opcode = B0->getOpcode();
- if (Opcode != B1->getOpcode())
+ Value *X, *Y, *Z, *W;
+ bool IsCommutative = false;
+ CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
+ match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
+ auto *BO = cast<BinaryOperator>(LHS);
+ // Don't introduce poison into div/rem.
+ if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
+ return false;
+ IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
+ } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
+ match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) {
+ IsCommutative = cast<CmpInst>(LHS)->isCommutative();
+ } else
return false;
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
- auto *BinOpTy = dyn_cast<FixedVectorType>(LHS->getType());
- if (!ShuffleDstTy || !BinOpTy)
+ auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
+ auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
return false;
unsigned NumSrcElts = BinOpTy->getNumElements();
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
- Value *X = LHS->getOperand(0), *Y = LHS->getOperand(1);
- Value *Z = RHS->getOperand(0), *W = RHS->getOperand(1);
- if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
- (X == W || Y == Z))
+ if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
std::swap(X, Y);
auto ConvertToUnary = [NumSrcElts](int &M) {
@@ -1688,13 +1693,22 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
InstructionCost OldCost =
TTI.getInstructionCost(LHS, CostKind) +
TTI.getInstructionCost(RHS, CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
InstructionCost NewCost =
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
- TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
- TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+ TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+
+ if (Pred == CmpInst::BAD_ICMP_PREDICATE) {
+ NewCost +=
+ TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
+ } else {
+ auto *ShuffleCmpTy =
+ FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
+ NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
+ ShuffleDstTy, Pred, CostKind);
+ }
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1704,7 +1718,10 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
- Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE
+ ? Builder.CreateBinOp(
+ cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
+ : Builder.CreateCmp(Pred, Shuf0, Shuf1);
// Intersect flags from the old binops.
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index 008c1e7e694b96..b3360b61e66e81 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,21 +1,37 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
declare void @use(<4 x i1>)
; icmp - eq v4i32 is cheap
define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
-; CHECK-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
+; SSE-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
+; SSE-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX512-NEXT: [[S:%.*]] = icmp eq <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT: ret <4 x i32> [[R]]
;
%c0 = icmp eq <4 x i32> %x, %y
%c1 = icmp eq <4 x i32> %z, %w
@@ -27,13 +43,37 @@ define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <
; icmp - eq v2i64 is only cheap on SSE4+ targets with PCMPEQQ
define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <2 x i64> %w) {
-; CHECK-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
-; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
-; CHECK-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
-; CHECK-NEXT: ret <2 x i64> [[R]]
+; SSE2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; SSE4-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; SSE4-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE4-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
+; SSE4-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
+; SSE4-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE4-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; AVX2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
+; AVX2-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX512-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; AVX512-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; AVX512-NEXT: ret <2 x i64> [[R]]
;
%c0 = icmp eq <2 x i64> %x, %y
%c1 = icmp eq <2 x i64> %z, %w
@@ -46,10 +86,10 @@ define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <
define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
; CHECK-LABEL: define <4 x i32> @shuf_icmp_ugt_v4i32(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[C0:%.*]] = icmp ugt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT: [[C1:%.*]] = icmp ugt <4 x i32> [[Z]], [[W]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[S:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
@@ -60,16 +100,32 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
ret <4 x i32> %r
}
-; Common operand is op0 of the fcmps.
+; Common operand is op0 of the fcmps (CMPPS cheaper on SSE4+).
define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; CHECK-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT: [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; SSE4-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE4-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; SSE4-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; SSE4-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE4-NEXT: ret <4 x i32> [[R]]
+;
+; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX-NEXT: ret <4 x i32> [[R]]
;
%b0 = fcmp oeq <4 x float> %x, %y
%b1 = fcmp oeq <4 x float> %x, %z
@@ -81,13 +137,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
; For commutative instructions, common operand may be swapped
define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
-; CHECK-NEXT: [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; SSE-NEXT: [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; AVX512-NEXT: [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT: ret <4 x i32> [[R]]
;
%b0 = fcmp one <4 x float> %x, %y
%b1 = fcmp one <4 x float> %z, %x
@@ -99,13 +171,29 @@ define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x
; non-commutative pred, but common op0
define <4 x i32> @shuf_icmp_sgt_v4i32_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; SSE-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
+; SSE-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; AVX512-NEXT: [[S:%.*]] = icmp sgt <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT: ret <4 x i32> [[R]]
;
%b0 = icmp sgt <4 x i32> %x, %y
%b1 = icmp sgt <4 x i32> %x, %z
More information about the llvm-commits mailing list