[llvm] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT operations. (PR #112647)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Tue May 13 02:16:55 PDT 2025
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/112647
>From d4260d3ee4fbcd802a9adc46a086c2cbda3bd29e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 09:59:03 +0800
Subject: [PATCH 01/11] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT
operations.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++
2 files changed, 10 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 236c373e70250..0b94fbf869fc8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3731,6 +3731,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return DAG.getBuildVector(VT, DL, Args);
}
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a42214865ccfd..3302cb533fc96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,6 +99,8 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ bool hasAndNot(SDValue Y) const override;
+
protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
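For context, TargetLowering::hasAndNot is consulted by generic DAG combines such as the masked-merge unfold (the pattern exercised by the unfold-masked-merge-scalar-variablemask.ll test added later in this series): when the hook returns true, the combiner is willing to keep an explicit ~mask operand so instruction selection can pick the scalar and-not instruction. A minimal IR sketch of the pattern this enables (function and value names are illustrative, not part of the patch):

  ; r = (x & mask) | (y & ~mask); with hasAndNot() returning true for a
  ; uniform i32, the (y & ~mask) half can select to S_ANDN2_B32 instead of
  ; a separate s_not + s_and.
  define i32 @masked_merge_sketch(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
    %mx = and i32 %x, %mask
    %notmask = xor i32 %mask, -1
    %my = and i32 %y, %notmask
    %r = or i32 %mx, %my
    ret i32 %r
  }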
>From 890ffe780dfd3dedf1ccb887de2dc00d75587255 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 10:31:26 +0800
Subject: [PATCH 02/11] [AMDGPU] Update value name.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3302cb533fc96..f54954b678dac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,7 +99,7 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- bool hasAndNot(SDValue Y) const override;
+ bool hasAndNot(SDValue Op) const override;
protected:
bool shouldCombineMemoryType(EVT VT) const;
>From 6e24fd2cc0fb88f3b88d12daacfe69694149d6c2 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:07:09 +0800
Subject: [PATCH 03/11] [AMDGPU] Update patch.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 ++++++++--------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 9 +++++++--
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b94fbf869fc8..e85597091bf97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3731,14 +3731,6 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return DAG.getBuildVector(VT, DL, Args);
}
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
- if (Op->isDivergent())
- return false;
-
- EVT VT = Op.getValueType();
- return VT == MVT::i32 || VT == MVT::i64;
-}
-
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -6097,3 +6089,11 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
Register N0, Register N1) const {
return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
+
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f54954b678dac..1d327755171d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,8 +99,6 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- bool hasAndNot(SDValue Op) const override;
-
protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -387,6 +385,13 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
+
+ /// Return true if the target supports a bitwise and-not operation:
+ /// X = ~A & B
+ /// This function checks if the operation can be directly mapped to the
+ /// target's native instructions, potentially simplifying select or other
+ /// related instructions by using more efficient hardware-specific operations.
+ bool hasAndNot(SDValue Op) const override;
};
namespace AMDGPUISD {
>From 30f40a6dfd148c415c18e27a9eb662d5c155269a Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:54:13 +0800
Subject: [PATCH 04/11] [AMDGPU] Move to SIISelLowering.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 --------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 -------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 7 +++++++
4 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e85597091bf97..236c373e70250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -6089,11 +6089,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
Register N0, Register N1) const {
return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
-
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
- if (Op->isDivergent())
- return false;
-
- EVT VT = Op.getValueType();
- return VT == MVT::i32 || VT == MVT::i64;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 1d327755171d1..a42214865ccfd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -385,13 +385,6 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
-
- /// Return true if the target supports a bitwise and-not operation:
- /// X = ~A & B
- /// This function checks if the operation can be directly mapped to the
- /// target's native instructions, potentially simplifying select or other
- /// related instructions by using more efficient hardware-specific operations.
- bool hasAndNot(SDValue Op) const override;
};
namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c05ba42d999e9..cd7fbf0d796cc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17525,3 +17525,11 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
AI->eraseFromParent();
return LI;
}
+
+bool SITargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..484e0a221b4a8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -611,6 +611,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
+
+ /// Return true if the target supports a bitwise and-not operation:
+ /// X = ~A & B
+ /// This function checks if the operation can be directly mapped to the
+ /// target's native instructions, potentially simplifying select or other
+ /// related instructions by using more efficient hardware-specific operations.
+ bool hasAndNot(SDValue Op) const override;
};
// Returns true if argument is a boolean value which is not serialized into
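The divergence check matters because S_ANDN2_B32/S_ANDN2_B64 are SALU instructions that operate on SGPRs; a divergent value lives in VGPRs, where the same pattern is emitted as v_not_b32 followed by v_and_b32 (see the existing vector_andn2_* checks in andorn2.ll). A hedged sketch of a divergent case, assuming the usual workitem-id intrinsic (the kernel name is illustrative):

  ; %nb depends on the lane id, so the AND is divergent, hasAndNot() returns
  ; false, and codegen falls back to v_not_b32 + v_and_b32 in VGPRs.
  define amdgpu_kernel void @divergent_andn2_sketch(ptr addrspace(1) %out, i32 %a) {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %nb = xor i32 %tid, -1
    %r = and i32 %a, %nb
    store i32 %r, ptr addrspace(1) %out
    ret void
  }

  declare i32 @llvm.amdgcn.workitem.id.x()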
>From 0e68f59b4e375e635f2e913e97a9d891f5598e24 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 14:22:34 +0800
Subject: [PATCH 05/11] [AMDGPU] Update comments.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 ++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 ------
2 files changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cd7fbf0d796cc..8858252790fd8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17527,6 +17527,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
}
bool SITargetLowering::hasAndNot(SDValue Op) const {
+ // Return false if the operation is divergent, as AND-NOT optimization
+ // requires uniform behavior across threads.
if (Op->isDivergent())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 484e0a221b4a8..b348702a5bd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -611,12 +611,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
-
- /// Return true if the target supports a bitwise and-not operation:
- /// X = ~A & B
- /// This function checks if the operation can be directly mapped to the
- /// target's native instructions, potentially simplifying select or other
- /// related instructions by using more efficient hardware-specific operations.
bool hasAndNot(SDValue Op) const override;
};
>From 8919a2144804006ffa408267e2a8077650c0fe43 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:21:18 +0800
Subject: [PATCH 06/11] [AMDGPU] Add a lit test for hasAndNot.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 86 +++++++++++++++++++++--
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ++++++++++
3 files changed, 122 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8858252790fd8..d42ddc4612d53 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7029,6 +7029,81 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
+SDValue SITargetLowering::combineAnd(SDValue Op,
+ DAGCombinerInfo &DCI) const {
+ const unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::AND);
+
+ auto &DAG = DCI.DAG;
+ SDLoc DL(Op);
+
+ if(hasAndNot(Op)) {
+ SDValue LHS = Op->getOperand(0);
+ SDValue RHS = Op->getOperand(1);
+
+ // (and LHS, (or Y, ~Z))
+ if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
+ SDValue Y = RHS->getOperand(0);
+ SDValue NotZ = RHS->getOperand(1);
+
+ if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+ SDValue Z = NotZ->getOperand(0);
+
+ if (!isa<ConstantSDNode>(Y)) {
+ SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
+ SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+ SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+ return NewAnd;
+ }
+ }
+ }
+ }
+
+ EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+ : Op->getOperand(0).getValueType();
+ auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+ if (DCI.isBeforeLegalizeOps() ||
+ isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+ return SDValue();
+
+ SDValue LHS;
+ SDValue RHS;
+ if (Opc == ISD::SELECT) {
+ LHS = Op->getOperand(1);
+ RHS = Op->getOperand(2);
+ } else {
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ }
+
+ const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+ LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+
+ // Special case: for shifts, the RHS always needs a zext.
+ if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
+ else
+ RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+ // setcc always return i1/i1 vec so no need to truncate after.
+ if (Opc == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+ }
+
+ // For other ops, we extend the operation's return type as well so we need to
+ // truncate back to the original type.
+ SDValue NewVal;
+ if (Opc == ISD::SELECT)
+ NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
+ else
+ NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
+
+ return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
+}
+
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
@@ -15244,13 +15319,17 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
+ case ISD::AND:
+ if (auto Res = combineAnd(SDValue(N, 0), DCI))
+ return Res;
+ break;
case ISD::ADD:
case ISD::SUB:
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
- case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::MUL:
@@ -15356,7 +15435,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CLAMP:
return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
- SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -17527,8 +17605,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
}
bool SITargetLowering::hasAndNot(SDValue Op) const {
- // Return false if the operation is divergent, as AND-NOT optimization
- // requires uniform behavior across threads.
+ // Return false if the operation is divergent, as AND-NOT is a scalar-only
+ // instruction.
if (Op->isDivergent())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b348702a5bd8d..a418bae67ebc4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,6 +148,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
new file mode 100644
index 0000000000000..821709847ab8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andornot.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i16
+; GCN: s_not_b32
+; GCN-NEXT: s_lshr_b32
+; GCN-NEXT: s_and_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
+entry:
+ %not_z = xor i16 %z, -1
+ %or_y_not_z = or i16 %y, %not_z
+ %and_result = and i16 %x, %or_y_not_z
+ store i16 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i32
+; GCN: s_andn2_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %not_z = xor i32 %z, -1
+ %or_y_not_z = or i32 %y, %not_z
+ %and_result = and i32 %x, %or_y_not_z
+ store i32 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i64
+; GCN: s_andn2_b64
+; GCN-NEXT: s_andn2_b64
+define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
+entry:
+ %not_z = xor i64 %z, -1
+ %or_y_not_z = or i64 %y, %not_z
+ %and_result = and i64 %x, %or_y_not_z
+ store i64 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
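The rewrite in combineAnd is De Morgan's law applied so that both ANDs can use the and-not form: Y | ~Z == ~(~Y & Z), hence

  LHS & (Y | ~Z)
    = LHS & ~(~Y & Z)                 ; De Morgan
    = s_andn2(LHS, s_andn2(Z, Y))     ; s_andn2(a, b) computes a & ~b

which is why the i32/i64 cases in the new andornot.ll expect two s_andn2 instructions. The mapping to s_andn2 above is a sketch of the intended selection, not output copied from the compiler.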
>From 92427f6f948768febcad43a78fdc91121ebd7a3e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:37:51 +0800
Subject: [PATCH 07/11] [AMDGPU] Fix clang format issue.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d42ddc4612d53..f7df921f41da6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7029,15 +7029,14 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
-SDValue SITargetLowering::combineAnd(SDValue Op,
- DAGCombinerInfo &DCI) const {
+SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
assert(Opc == ISD::AND);
auto &DAG = DCI.DAG;
SDLoc DL(Op);
- if(hasAndNot(Op)) {
+ if (hasAndNot(Op)) {
SDValue LHS = Op->getOperand(0);
SDValue RHS = Op->getOperand(1);
@@ -7046,20 +7045,24 @@ SDValue SITargetLowering::combineAnd(SDValue Op,
SDValue Y = RHS->getOperand(0);
SDValue NotZ = RHS->getOperand(1);
- if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+ if (NotZ.getOpcode() == ISD::XOR &&
+ isAllOnesConstant(NotZ->getOperand(1))) {
SDValue Z = NotZ->getOperand(0);
if (!isa<ConstantSDNode>(Y)) {
SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
- SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
- SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
- SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+ SDValue AndNotYZ =
+ DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+ SDValue NotAndNotYZ =
+ DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+ SDValue NewAnd =
+ DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
return NewAnd;
}
}
}
}
-
+
EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
: Op->getOperand(0).getValueType();
auto ExtTy = OpTy.changeElementType(MVT::i32);
>From b448b133f730a7a2d37b618f0f965da11b560012 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 14:27:15 +0800
Subject: [PATCH 08/11] [AMDGPU] Remove combineAnd.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 87 +----------------------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
llvm/test/CodeGen/AMDGPU/andorn2.ll | 44 ++++++++++++
llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ----------
4 files changed, 47 insertions(+), 124 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f7df921f41da6..e1082e878769f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7029,84 +7029,6 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
-SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const {
- const unsigned Opc = Op.getOpcode();
- assert(Opc == ISD::AND);
-
- auto &DAG = DCI.DAG;
- SDLoc DL(Op);
-
- if (hasAndNot(Op)) {
- SDValue LHS = Op->getOperand(0);
- SDValue RHS = Op->getOperand(1);
-
- // (and LHS, (or Y, ~Z))
- if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
- SDValue Y = RHS->getOperand(0);
- SDValue NotZ = RHS->getOperand(1);
-
- if (NotZ.getOpcode() == ISD::XOR &&
- isAllOnesConstant(NotZ->getOperand(1))) {
- SDValue Z = NotZ->getOperand(0);
-
- if (!isa<ConstantSDNode>(Y)) {
- SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
- SDValue AndNotYZ =
- DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
- SDValue NotAndNotYZ =
- DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
- SDValue NewAnd =
- DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
- return NewAnd;
- }
- }
- }
- }
-
- EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
- : Op->getOperand(0).getValueType();
- auto ExtTy = OpTy.changeElementType(MVT::i32);
-
- if (DCI.isBeforeLegalizeOps() ||
- isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
- return SDValue();
-
- SDValue LHS;
- SDValue RHS;
- if (Opc == ISD::SELECT) {
- LHS = Op->getOperand(1);
- RHS = Op->getOperand(2);
- } else {
- LHS = Op->getOperand(0);
- RHS = Op->getOperand(1);
- }
-
- const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
- LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
-
- // Special case: for shifts, the RHS always needs a zext.
- if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
- RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
- else
- RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
-
- // setcc always return i1/i1 vec so no need to truncate after.
- if (Opc == ISD::SETCC) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
- }
-
- // For other ops, we extend the operation's return type as well so we need to
- // truncate back to the original type.
- SDValue NewVal;
- if (Opc == ISD::SELECT)
- NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
- else
- NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
-
- return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
-}
-
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
@@ -15322,17 +15244,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
- case ISD::AND:
- if (auto Res = combineAnd(SDValue(N, 0), DCI))
- return Res;
- break;
case ISD::ADD:
case ISD::SUB:
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
+ case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::MUL:
@@ -15438,6 +15356,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CLAMP:
return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
+ SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -17610,7 +17529,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
bool SITargetLowering::hasAndNot(SDValue Op) const {
// Return false if the operation is divergent, as AND-NOT is a scalar-only
// instruction.
- if (Op->isDivergent())
+ if (Op->isDivergent() || !Op->isMachineOpcode())
return false;
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a418bae67ebc4..b348702a5bd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,7 +148,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
- SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 3226a77bb9d34..e1fdddf4438b6 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -25,6 +25,28 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
+; GCN: s_andn2_b32
+define amdgpu_kernel void @scalar_andn2_i32_one_sgpr(
+ ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+entry:
+ %nb = xor i32 %b, -1
+ %r0.val = and i32 %a, %nb
+ store i32 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
+; GCN: s_andn2_b64
+define amdgpu_kernel void @scalar_andn2_i64_one_sgpr(
+ ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+entry:
+ %nb = xor i64 %b, -1
+ %r0.val = and i64 %a, %nb
+ store i64 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
; GCN: s_orn2_b32
define amdgpu_kernel void @scalar_orn2_i32_one_use(
@@ -47,6 +69,28 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
+; GCN: s_orn2_b32
+define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr(
+ ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+entry:
+ %nb = xor i32 %b, -1
+ %r0.val = or i32 %a, %nb
+ store i32 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
+; GCN: s_orn2_b64
+define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr(
+ ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+entry:
+ %nb = xor i64 %b, -1
+ %r0.val = or i64 %a, %nb
+ store i64 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
; GCN: v_not_b32
; GCN: v_and_b32
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
deleted file mode 100644
index 821709847ab8d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/andornot.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i16
-; GCN: s_not_b32
-; GCN-NEXT: s_lshr_b32
-; GCN-NEXT: s_and_b32
-; GCN-NEXT: s_andn2_b32
-define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
-entry:
- %not_z = xor i16 %z, -1
- %or_y_not_z = or i16 %y, %not_z
- %and_result = and i16 %x, %or_y_not_z
- store i16 %and_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i32
-; GCN: s_andn2_b32
-; GCN-NEXT: s_andn2_b32
-define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
-entry:
- %not_z = xor i32 %z, -1
- %or_y_not_z = or i32 %y, %not_z
- %and_result = and i32 %x, %or_y_not_z
- store i32 %and_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i64
-; GCN: s_andn2_b64
-; GCN-NEXT: s_andn2_b64
-define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
-entry:
- %not_z = xor i64 %z, -1
- %or_y_not_z = or i64 %y, %not_z
- %and_result = and i64 %x, %or_y_not_z
- store i64 %and_result, ptr addrspace(1) %out, align 4
- ret void
-}
>From f23fa09c31d06acfc4d2b11d42479506c519124d Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Mon, 21 Oct 2024 15:59:26 +0800
Subject: [PATCH 09/11] [AMDGPU] Update lit test.
---
llvm/test/CodeGen/AMDGPU/andorn2.ll | 32 +++++++++++------------------
1 file changed, 12 insertions(+), 20 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index e1fdddf4438b6..4fe7e21b2adea 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -27,24 +27,20 @@ entry:
; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
; GCN: s_andn2_b32
-define amdgpu_kernel void @scalar_andn2_i32_one_sgpr(
- ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) {
entry:
%nb = xor i32 %b, -1
- %r0.val = and i32 %a, %nb
- store i32 %r0.val, ptr addrspace(1) %r0
- ret void
+ %and = and i32 %a, %nb
+ ret i32 %and
}
; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
; GCN: s_andn2_b64
-define amdgpu_kernel void @scalar_andn2_i64_one_sgpr(
- ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) {
entry:
%nb = xor i64 %b, -1
- %r0.val = and i64 %a, %nb
- store i64 %r0.val, ptr addrspace(1) %r0
- ret void
+ %and = and i64 %a, %nb
+ ret i64 %and
}
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
@@ -71,24 +67,20 @@ entry:
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
; GCN: s_orn2_b32
-define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr(
- ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) {
entry:
%nb = xor i32 %b, -1
- %r0.val = or i32 %a, %nb
- store i32 %r0.val, ptr addrspace(1) %r0
- ret void
+ %or = or i32 %a, %nb
+ ret i32 %or;
}
; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
; GCN: s_orn2_b64
-define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr(
- ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) {
entry:
%nb = xor i64 %b, -1
- %r0.val = or i64 %a, %nb
- store i64 %r0.val, ptr addrspace(1) %r0
- ret void
+ %or = or i64 %a, %nb
+ ret i64 %or;
}
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use

>From 307a27581aefe048a1c68e349d23d54ab216c14e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 27 Apr 2025 18:03:24 +0800
Subject: [PATCH 10/11] [AMDGPU] Add unfold test.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/andorn2.ll | 36 -
...unfold-masked-merge-scalar-variablemask.ll | 764 ++++++++++++++++++
3 files changed, 765 insertions(+), 37 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e1082e878769f..970ae28f39d0c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17529,7 +17529,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
bool SITargetLowering::hasAndNot(SDValue Op) const {
// Return false if the operation is divergent, as AND-NOT is a scalar-only
// instruction.
- if (Op->isDivergent() || !Op->isMachineOpcode())
+ if (Op->isDivergent())
return false;
EVT VT = Op.getValueType();
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 4fe7e21b2adea..3226a77bb9d34 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -25,24 +25,6 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
-; GCN: s_andn2_b32
-define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) {
-entry:
- %nb = xor i32 %b, -1
- %and = and i32 %a, %nb
- ret i32 %and
-}
-
-; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
-; GCN: s_andn2_b64
-define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) {
-entry:
- %nb = xor i64 %b, -1
- %and = and i64 %a, %nb
- ret i64 %and
-}
-
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
; GCN: s_orn2_b32
define amdgpu_kernel void @scalar_orn2_i32_one_use(
@@ -65,24 +47,6 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
-; GCN: s_orn2_b32
-define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) {
-entry:
- %nb = xor i32 %b, -1
- %or = or i32 %a, %nb
- ret i32 %or;
-}
-
-; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
-; GCN: s_orn2_b64
-define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) {
-entry:
- %nb = xor i64 %b, -1
- %or = or i64 %a, %nb
- ret i64 %or;
-}
-
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
; GCN: v_not_b32
; GCN: v_and_b32
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
new file mode 100644
index 0000000000000..6eeeb71006399
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -0,0 +1,764 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+define i32 @out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %mx = and i32 %x, %mask
+ %notmask = xor i32 %mask, -1
+ %my = and i32 %y, %notmask
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i64 @out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
+; GCN-LABEL: out64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %mx = and i64 %x, %mask
+ %notmask = xor i64 %mask, -1
+ %my = and i64 %y, %notmask
+ %r = or i64 %mx, %my
+ ret i64 %r
+}
+
+define i32 @in32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i64 @in64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
+; GCN-LABEL: in64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i64 %x, %y
+ %n1 = and i64 %n0, %mask
+ %r = xor i64 %n1, %y
+ ret i64 %r
+}
+; ============================================================================ ;
+; Commutativity tests.
+; ============================================================================ ;
+define i32 @in_commutativity_0_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_0_0_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @in_commutativity_0_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_0_1_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %y, %n1
+ ret i32 %r
+}
+
+define i32 @in_commutativity_0_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_0_1_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %y, %n1
+ ret i32 %r
+}
+
+define i32 @in_commutativity_1_0_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %x
+ ret i32 %r
+}
+
+define i32 @in_commutativity_1_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_0_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %n1, %x
+ ret i32 %r
+}
+
+define i32 @in_commutativity_1_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_1_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %x, %n1
+ ret i32 %r
+}
+
+define i32 @in_commutativity_1_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_1_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %x, %n1
+ ret i32 %r
+}
+; ============================================================================ ;
+; Y is an 'and' too.
+; ============================================================================ ;
+define i32 @in_complex_y0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %mask) {
+; GCN-LABEL: in_complex_y0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s3
+; GCN-NEXT: s_and_not1_b32 s1, s1, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %y = and i32 %y_hi, %y_low
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @in_complex_y1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %mask) {
+; GCN-LABEL: in_complex_y1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s3
+; GCN-NEXT: s_and_not1_b32 s1, s1, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %y = and i32 %y_hi, %y_low
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %y, %n1
+ ret i32 %r
+}
+; ============================================================================ ;
+; M is an 'xor' too.
+; ============================================================================ ;
+define i32 @in_complex_m0(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s2, s2, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %mask = xor i32 %m_a, %m_b
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @in_complex_m1(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_m1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s2, s2, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %mask = xor i32 %m_a, %m_b
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+; ============================================================================ ;
+; Both Y and M are complex.
+; ============================================================================ ;
+define i32 @in_complex_y0_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y0_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s2, s3, s16
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %y = and i32 %y_hi, %y_low
+ %mask = xor i32 %m_a, %m_b
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @in_complex_y1_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y1_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s2, s3, s16
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %y = and i32 %y_hi, %y_low
+ %mask = xor i32 %m_a, %m_b
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %y, %n1
+ ret i32 %r
+}
+
+define i32 @in_complex_y0_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y0_m1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s2, s3, s16
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %y = and i32 %y_hi, %y_low
+ %mask = xor i32 %m_a, %m_b
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @in_complex_y1_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y1_m1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s2, s3, s16
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %y = and i32 %y_hi, %y_low
+ %mask = xor i32 %m_a, %m_b
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %mask, %n0
+ %r = xor i32 %y, %n1
+ ret i32 %r
+}
+; ============================================================================ ;
+; Various cases with %x and/or %y being a constant
+; ============================================================================ ;
+define i32 @out_constant_varx_mone(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_mone:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s2, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_not1_b32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %mask, %x
+ %my = and i32 %notmask, -1
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i32 @in_constant_varx_mone(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_mone:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_not_b32 s0, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_nand_b32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, -1
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, -1
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_varx_mone_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_mone_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %notmask, %x
+ %my = and i32 %mask, -1
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_varx_mone_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_mone_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_not_b32 s1, s2
+; GCN-NEXT: s_not_b32 s0, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_nand_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %n0 = xor i32 %x, -1
+ %n1 = and i32 %n0, %notmask
+ %r = xor i32 %n1, -1
+ ret i32 %r
+}
+
+define i32 @out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_42:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s2, s0
+; GCN-NEXT: s_and_not1_b32 s1, 42, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %mask, %x
+ %my = and i32 %notmask, 42
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i32 @in_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_42:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s1, 42, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, 42
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, 42
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_42_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s2, 42
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %notmask, %x
+ %my = and i32 %mask, 42
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_42_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s2, 42
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %n0 = xor i32 %x, 42
+ %n1 = and i32 %n0, %notmask
+ %r = xor i32 %n1, 42
+ ret i32 %r
+}
+
+define i32 @out_constant_mone_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_mone_vary:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_b32 s0, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %mask, -1
+ %my = and i32 %notmask, %y
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i32 @in_constant_mone_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_mone_vary:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_b32 s0, s2, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 -1, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_mone_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_mone_vary_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s2, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_not1_b32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %notmask, -1
+ %my = and i32 %mask, %y
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_mone_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_mone_vary_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_not1_b32 s0, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %n0 = xor i32 -1, %y
+ %n1 = and i32 %n0, %notmask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_42_vary:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s2, 42
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %mask, 42
+ %my = and i32 %notmask, %y
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i32 @in_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_42_vary:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, s1, s2
+; GCN-NEXT: s_and_b32 s1, s2, 42
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 42, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_42_vary_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_not1_b32 s0, 42, s2
+; GCN-NEXT: s_and_b32 s1, s2, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %mx = and i32 %notmask, 42
+ %my = and i32 %mask, %y
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_42_vary_invmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s1, s2
+; GCN-NEXT: s_and_not1_b32 s1, 42, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %notmask = xor i32 %mask, -1
+ %n0 = xor i32 42, %y
+ %n1 = and i32 %n0, %notmask
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+; ============================================================================ ;
+; Negative tests. Should not be folded.
+; ============================================================================ ;
+; Multi-use tests.
+declare void @use32(i32) nounwind
+define i32 @in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg %mask) nounwind {
+; GCN-LABEL: in_multiuse_A:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s2, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_or_saveexec_b32 s16, -1
+; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 exec_lo, s16
+; GCN-NEXT: v_writelane_b32 v40, s2, 4
+; GCN-NEXT: s_add_i32 s32, s32, 16
+; GCN-NEXT: s_getpc_b64 s[16:17]
+; GCN-NEXT: s_add_u32 s16, s16, use32@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: s_mov_b32 s34, s1
+; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: s_and_b32 s35, s0, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s35
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: s_xor_b32 s0, s35, s34
+; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_readlane_b32 s34, v40, 2
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: v_readlane_b32 s0, v40, 4
+; GCN-NEXT: s_or_saveexec_b32 s1, -1
+; GCN-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b32 exec_lo, s1
+; GCN-NEXT: s_mov_b32 s33, s0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ call void @use32(i32 %n1)
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+define i32 @in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg %mask) nounwind {
+; GCN-LABEL: in_multiuse_B:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s2, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_or_saveexec_b32 s16, -1
+; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 exec_lo, s16
+; GCN-NEXT: s_add_i32 s32, s32, 16
+; GCN-NEXT: s_getpc_b64 s[16:17]
+; GCN-NEXT: s_add_u32 s16, s16, use32@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s2, 4
+; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: s_mov_b32 s34, s1
+; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: s_and_b32 s35, s0, s3
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: s_xor_b32 s0, s35, s34
+; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_readlane_b32 s34, v40, 2
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: v_readlane_b32 s0, v40, 4
+; GCN-NEXT: s_or_saveexec_b32 s1, -1
+; GCN-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b32 exec_lo, s1
+; GCN-NEXT: s_mov_b32 s33, s0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ call void @use32(i32 %n0)
+ %r = xor i32 %n1, %y
+ ret i32 %r
+}
+
+; Various bad variants
+define i32 @n0_badmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask, i32 inreg %mask2) {
+; GCN-LABEL: n0_badmask:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_and_not1_b32 s1, s1, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %mx = and i32 %x, %mask
+ %notmask = xor i32 %mask2, -1
+ %my = and i32 %y, %notmask
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i32 @n0_badxor(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: n0_badxor:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s3, s2, 1
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s3
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %mx = and i32 %x, %mask
+ %notmask = xor i32 %mask, 1
+ %my = and i32 %y, %notmask
+ %r = or i32 %mx, %my
+ ret i32 %r
+}
+
+define i32 @n1_thirdvar(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg %mask) {
+; GCN-LABEL: n1_thirdvar:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_b32 s0, s0, s3
+; GCN-NEXT: s_xor_b32 s0, s0, s2
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %n0 = xor i32 %x, %y
+ %n1 = and i32 %n0, %mask
+ %r = xor i32 %n1, %z
+ ret i32 %r
+}
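
For reference (not part of the patch): the tests above exercise the scalar masked-merge unfold that TargetLowering::hasAndNot gates. ((x ^ y) & m) ^ y is equivalent to (x & m) | (y & ~m), and reporting and-not support for uniform i32/i64 values lets the backend keep the second form, so the (y & ~m) term maps onto s_andn2_b32 / s_and_not1_b32. A minimal IR sketch of the two forms (function names are illustrative only):

define i32 @masked_merge_folded(i32 %x, i32 %y, i32 %m) {
  ; canonical form: ((x ^ y) & m) ^ y
  %n0 = xor i32 %x, %y
  %n1 = and i32 %n0, %m
  %r = xor i32 %n1, %y
  ret i32 %r
}

define i32 @masked_merge_unfolded(i32 %x, i32 %y, i32 %m) {
  ; unfolded form preferred when hasAndNot() is true: (x & m) | (y & ~m)
  %mx = and i32 %x, %m
  %notm = xor i32 %m, -1
  %my = and i32 %y, %notm
  %r = or i32 %mx, %my
  ret i32 %r
}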
>From 5e6480140600702350d19c67aa24260aad45ca4b Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 27 Apr 2025 18:57:18 +0800
Subject: [PATCH 11/11] [AMDGPU] Update.
---
llvm/test/CodeGen/AMDGPU/bfi_int.ll | 109 ++-
llvm/test/CodeGen/AMDGPU/commute-compares.ll | 866 +++++++++++++++++++
2 files changed, 933 insertions(+), 42 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d479c68..6e9cd8807b379 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_xor_b32 s1, s1, s2
-; GFX7-NEXT: s_and_b32 s0, s0, s1
-; GFX7-NEXT: s_xor_b32 s0, s2, s0
+; GFX7-NEXT: s_andn2_b32 s2, s2, s0
+; GFX7-NEXT: s_and_b32 s0, s1, s0
+; GFX7-NEXT: s_or_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s1, s1, s2
-; GFX8-NEXT: s_and_b32 s0, s0, s1
-; GFX8-NEXT: s_xor_b32 s0, s2, s0
+; GFX8-NEXT: s_andn2_b32 s2, s2, s0
+; GFX8-NEXT: s_and_b32 s0, s1, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s0, s0, s1
-; GFX10-NEXT: s_xor_b32 s0, s2, s0
+; GFX10-NEXT: s_andn2_b32 s2, s2, s0
+; GFX10-NEXT: s_and_b32 s0, s1, s0
+; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
@@ -317,19 +317,26 @@ entry:
define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
+; GFX7-NEXT: s_not_b32 s1, s1
+; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_nand_b32 s0, s1, s0
+; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
+; GFX8-NEXT: s_not_b32 s1, s1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: s_nand_b32 s0, s1, s0
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: s_not_b32 s1, s1
+; GFX10-NEXT: s_nand_b32 s0, s1, s0
+; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
@@ -350,30 +357,40 @@ entry:
ret float %cast
}
-define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
+define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_v_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX7-NEXT: s_not_b32 s1, s1
+; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_nand_b32 s0, s1, s0
+; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-NEXT: s_not_b32 s1, s1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: s_nand_b32 s0, s1, s0
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: s_not_b32 s1, s1
+; GFX10-NEXT: s_nand_b32 s0, s1, s0
+; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%xor0 = xor i32 %y, %z
@@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6
define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
+; GFX7-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX7-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX7-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX7-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX7-NEXT: v_and_b32_e32 v1, s1, v1
+; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
+; GFX8-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v1, s1, v1
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
-; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
+; GFX10-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX10-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX10-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX10-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
@@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index fcb871cedd0cb..a14cd8b7a12bb 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -9,6 +10,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}commute_eq_64_i32:
; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_eq_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -22,6 +39,22 @@ define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}commute_ne_64_i32:
; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -37,6 +70,23 @@ define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspac
; GCN: s_movk_i32 [[K:s[0-9]+]], 0x3039
; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_litk_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_movk_i32 s4, 0x3039
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s4, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -50,6 +100,22 @@ define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ugt_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -63,6 +129,22 @@ define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_uge_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 63, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -76,6 +158,22 @@ define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_ult_64_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -89,6 +187,22 @@ define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_ule_63_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_63_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -103,6 +217,23 @@ define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspa
; GCN: s_movk_i32 [[K:s[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_movk_i32 s4, 0x41
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s4, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -116,6 +247,22 @@ define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
; GCN: v_ashrrev_i32_e32 v2, 31, v2
define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sgt_neg1_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_not_b32_e32 v2, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -129,6 +276,22 @@ define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrs
; GCN-LABEL: {{^}}commute_sge_neg2_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sge_neg2_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -3, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -142,6 +305,22 @@ define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrs
; GCN-LABEL: {{^}}commute_slt_neg16_i32:
; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_slt_neg16_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, -16, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -155,6 +334,22 @@ define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addr
; GCN-LABEL: {{^}}commute_sle_5_i32:
; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sle_5_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 6, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -172,6 +367,23 @@ define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}commute_eq_64_i64:
; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_eq_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -185,6 +397,23 @@ define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}commute_ne_64_i64:
; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -198,6 +427,23 @@ define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}commute_ugt_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -211,6 +457,23 @@ define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_uge_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -224,6 +487,23 @@ define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_ult_64_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -237,6 +517,23 @@ define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_ule_63_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_63_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -253,6 +550,24 @@ define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspa
; GCN: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x41
; GCN: v_cmp_gt_u64_e32 vcc, [[K]], v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[4:5], 0x41
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -266,6 +581,24 @@ define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sgt_neg1_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GCN-NEXT: v_not_b32_e32 v0, v0
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -279,6 +612,23 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs
; GCN-LABEL: {{^}}commute_sge_neg2_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sge_neg2_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -3, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -292,6 +642,23 @@ define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrs
; GCN-LABEL: {{^}}commute_slt_neg16_i64:
; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_slt_neg16_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, -16, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -305,6 +672,23 @@ define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addr
; GCN-LABEL: {{^}}commute_sle_5_i64:
; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sle_5_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 6, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -323,6 +707,22 @@ define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oeq_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -337,6 +737,22 @@ define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ogt_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -350,6 +766,22 @@ define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_oge_2.0_f32:
; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oge_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_le_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -363,6 +795,22 @@ define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_olt_2.0_f32:
; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_olt_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -376,6 +824,22 @@ define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ole_2.0_f32:
; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ole_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ge_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -389,6 +853,22 @@ define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_one_2.0_f32:
; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_one_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lg_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -402,6 +882,22 @@ define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ord_2.0_f32:
; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ord_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -415,6 +911,22 @@ define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ueq_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -428,6 +940,22 @@ define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nge_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -441,6 +969,22 @@ define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_uge_2.0_f32:
; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -454,6 +998,22 @@ define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ult_2.0_f32:
; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nle_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -467,6 +1027,22 @@ define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ule_2.0_f32:
; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -480,6 +1056,22 @@ define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_une_2.0_f32:
; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_une_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -493,6 +1085,22 @@ define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_uno_2.0_f32:
; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uno_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -511,6 +1119,23 @@ define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oeq_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -525,6 +1150,23 @@ define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ogt_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -538,6 +1180,23 @@ define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_oge_2.0_f64:
; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oge_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_le_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -551,6 +1210,23 @@ define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_olt_2.0_f64:
; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_olt_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -564,6 +1240,23 @@ define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ole_2.0_f64:
; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ole_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ge_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -577,6 +1270,23 @@ define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_one_2.0_f64:
; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_one_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lg_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -590,6 +1300,23 @@ define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ord_2.0_f64:
; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ord_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_o_f64_e32 vcc, v[3:4], v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -603,6 +1330,23 @@ define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ueq_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlg_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -616,6 +1360,23 @@ define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nge_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -629,6 +1390,23 @@ define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_uge_2.0_f64:
; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ngt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -642,6 +1420,23 @@ define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ult_2.0_f64:
; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nle_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -655,6 +1450,23 @@ define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_ule_2.0_f64:
; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -668,6 +1480,23 @@ define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_une_2.0_f64:
; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_une_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_neq_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -681,6 +1510,23 @@ define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_uno_2.0_f64:
; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uno_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[3:4], v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -702,6 +1548,26 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 {
+; GCN-LABEL: commute_frameindex:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN-NEXT: s_add_u32 s12, s12, s11
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
entry:
%stack0 = alloca i32, addrspace(5)
%ptr0 = load volatile ptr addrspace(5), ptr addrspace(1) poison