[llvm] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT operations. (PR #112647)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Mon May 19 09:42:14 PDT 2025
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/112647
>From a29b0b83bab0dd603c516078d0b927ee0030e12e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 09:59:03 +0800
Subject: [PATCH 01/11] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT
operations.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++
2 files changed, 10 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7ed055e8da2b6..a3d176e75e989 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3744,6 +3744,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return DAG.getBuildVector(VT, DL, Args);
}
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0dd2183b72b24..edde293500ab1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -102,6 +102,8 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ bool hasAndNot(SDValue Y) const override;
+
protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
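For reference, hasAndNot is the generic TargetLowering hook the DAG combiner consults before rewriting and-not style patterns, for example when deciding whether to unfold masked merges (see the unfold-masked-merge-scalar-variablemask.ll updates later in this series). A minimal sketch of the uniform IR this hook is meant to cover (function and value names are illustrative), matching the existing scalar_andn2 tests in andorn2.ll:

; Uniform kernel arguments, so the and stays on the SALU and is expected to
; select to s_andn2_b32.
define amdgpu_kernel void @scalar_andn2_sketch(ptr addrspace(1) %out, i32 %a, i32 %b) {
entry:
  %nb = xor i32 %b, -1          ; ~b
  %r  = and i32 %a, %nb         ; a & ~b
  store i32 %r, ptr addrspace(1) %out
  ret void
}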
>From e0ddab1f55e2c350b286487bdda37bff99d008cf Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 10:31:26 +0800
Subject: [PATCH 02/11] [AMDGPU] Update value name.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index edde293500ab1..fb4c29d4f0de3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -102,7 +102,7 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- bool hasAndNot(SDValue Y) const override;
+ bool hasAndNot(SDValue Op) const override;
protected:
bool shouldCombineMemoryType(EVT VT) const;
>From 7c900c2fbe3631c2ef30edcb7f6aaf0164f1bc61 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:07:09 +0800
Subject: [PATCH 03/11] [AMDGPU] Update patch.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 ++++++++--------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 9 +++++++--
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a3d176e75e989..e2aa94a242dbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3744,14 +3744,6 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return DAG.getBuildVector(VT, DL, Args);
}
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
- if (Op->isDivergent())
- return false;
-
- EVT VT = Op.getValueType();
- return VT == MVT::i32 || VT == MVT::i64;
-}
-
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -6116,3 +6108,11 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
Register N0, Register N1) const {
return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
+
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fb4c29d4f0de3..3c95bdaa8387c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -102,8 +102,6 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- bool hasAndNot(SDValue Op) const override;
-
protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -390,6 +388,13 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
+
+ /// Return true if the target supports a bitwise and-not operation:
+ /// X = ~A & B
+ /// This function checks if the operation can be directly mapped to the
+ /// target's native instructions, potentially simplifying select or other
+ /// related instructions by using more efficient hardware-specific operations.
+ bool hasAndNot(SDValue Op) const override;
};
namespace AMDGPUISD {
>From 563de337813767afc5aa8048c96591710ec627d7 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:54:13 +0800
Subject: [PATCH 04/11] [AMDGPU] Move to SIISelLowering.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 --------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 -------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 7 +++++++
4 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e2aa94a242dbe..7ed055e8da2b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -6108,11 +6108,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
Register N0, Register N1) const {
return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
-
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
- if (Op->isDivergent())
- return false;
-
- EVT VT = Op.getValueType();
- return VT == MVT::i32 || VT == MVT::i64;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3c95bdaa8387c..0dd2183b72b24 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -388,13 +388,6 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
-
- /// Return true if the target supports a bitwise and-not operation:
- /// X = ~A & B
- /// This function checks if the operation can be directly mapped to the
- /// target's native instructions, potentially simplifying select or other
- /// related instructions by using more efficient hardware-specific operations.
- bool hasAndNot(SDValue Op) const override;
};
namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ba7e11a853347..54fc06f2ac29d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17577,3 +17577,11 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
AI->eraseFromParent();
return LI;
}
+
+bool SITargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..484e0a221b4a8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -611,6 +611,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
+
+ /// Return true if the target supports a bitwise and-not operation:
+ /// X = ~A & B
+ /// This function checks if the operation can be directly mapped to the
+ /// target's native instructions, potentially simplifying select or other
+ /// related instructions by using more efficient hardware-specific operations.
+ bool hasAndNot(SDValue Op) const override;
};
// Returns true if argument is a boolean value which is not serialized into
>From b06240ea60483cb30f511b2ba045401c10fd9527 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 14:22:34 +0800
Subject: [PATCH 05/11] [AMDGPU] Update comments.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 ++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 ------
2 files changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 54fc06f2ac29d..4e8305646c0e4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17579,6 +17579,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
}
bool SITargetLowering::hasAndNot(SDValue Op) const {
+ // Return false if the operation is divergent, as AND-NOT optimization
+ // requires uniform behavior across threads.
if (Op->isDivergent())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 484e0a221b4a8..b348702a5bd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -611,12 +611,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
-
- /// Return true if the target supports a bitwise and-not operation:
- /// X = ~A & B
- /// This function checks if the operation can be directly mapped to the
- /// target's native instructions, potentially simplifying select or other
- /// related instructions by using more efficient hardware-specific operations.
bool hasAndNot(SDValue Op) const override;
};
>From 244612de314bd3d972159d592bed8d4637e96159 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:21:18 +0800
Subject: [PATCH 06/11] [AMDGPU] Add a lit test for hasAndNot.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 86 +++++++++++++++++++++--
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ++++++++++
3 files changed, 122 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4e8305646c0e4..a033523276992 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7061,6 +7061,81 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
+SDValue SITargetLowering::combineAnd(SDValue Op,
+ DAGCombinerInfo &DCI) const {
+ const unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::AND);
+
+ auto &DAG = DCI.DAG;
+ SDLoc DL(Op);
+
+ if(hasAndNot(Op)) {
+ SDValue LHS = Op->getOperand(0);
+ SDValue RHS = Op->getOperand(1);
+
+ // (and LHS, (or Y, ~Z))
+ if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
+ SDValue Y = RHS->getOperand(0);
+ SDValue NotZ = RHS->getOperand(1);
+
+ if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+ SDValue Z = NotZ->getOperand(0);
+
+ if (!isa<ConstantSDNode>(Y)) {
+ SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
+ SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+ SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+ return NewAnd;
+ }
+ }
+ }
+ }
+
+ EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+ : Op->getOperand(0).getValueType();
+ auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+ if (DCI.isBeforeLegalizeOps() ||
+ isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+ return SDValue();
+
+ SDValue LHS;
+ SDValue RHS;
+ if (Opc == ISD::SELECT) {
+ LHS = Op->getOperand(1);
+ RHS = Op->getOperand(2);
+ } else {
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ }
+
+ const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+ LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+
+ // Special case: for shifts, the RHS always needs a zext.
+ if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
+ else
+ RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+ // setcc always return i1/i1 vec so no need to truncate after.
+ if (Opc == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+ }
+
+ // For other ops, we extend the operation's return type as well so we need to
+ // truncate back to the original type.
+ SDValue NewVal;
+ if (Opc == ISD::SELECT)
+ NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
+ else
+ NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
+
+ return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
+}
+
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
@@ -15294,13 +15369,17 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
+ case ISD::AND:
+ if (auto Res = combineAnd(SDValue(N, 0), DCI))
+ return Res;
+ break;
case ISD::ADD:
case ISD::SUB:
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
- case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::MUL:
@@ -15408,7 +15487,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CLAMP:
return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
- SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -17579,8 +17657,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
}
bool SITargetLowering::hasAndNot(SDValue Op) const {
- // Return false if the operation is divergent, as AND-NOT optimization
- // requires uniform behavior across threads.
+ // Return false if the operation is divergent, as AND-NOT is a scalar-only
+ // instruction.
if (Op->isDivergent())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b348702a5bd8d..a418bae67ebc4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,6 +148,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
new file mode 100644
index 0000000000000..821709847ab8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andornot.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i16
+; GCN: s_not_b32
+; GCN-NEXT: s_lshr_b32
+; GCN-NEXT: s_and_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
+entry:
+ %not_z = xor i16 %z, -1
+ %or_y_not_z = or i16 %y, %not_z
+ %and_result = and i16 %x, %or_y_not_z
+ store i16 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i32
+; GCN: s_andn2_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %not_z = xor i32 %z, -1
+ %or_y_not_z = or i32 %y, %not_z
+ %and_result = and i32 %x, %or_y_not_z
+ store i32 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i64
+; GCN: s_andn2_b64
+; GCN-NEXT: s_andn2_b64
+define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
+entry:
+ %not_z = xor i64 %z, -1
+ %or_y_not_z = or i64 %y, %not_z
+ %and_result = and i64 %x, %or_y_not_z
+ store i64 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
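The combineAnd added above rewrites (and x, (or y, ~z)) using De Morgan's law so that both and-not steps can map to the scalar and-not instruction; a sketch of the intended equivalence for uniform i32/i64 values:

  x & (y | ~z) = x & ~(~y & z)              ; De Morgan on the inner or
              -> s_andn2(x, s_andn2(z, y))  ; two scalar and-not operations

which is consistent with the pair of s_andn2_b32 / s_andn2_b64 checks in the new andornot.ll test.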
>From 70d8ac0a21294220985e67826d8971c28f88da1c Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:37:51 +0800
Subject: [PATCH 07/11] [AMDGPU] Fix clang format issue.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a033523276992..751b344bae37a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7061,15 +7061,14 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
-SDValue SITargetLowering::combineAnd(SDValue Op,
- DAGCombinerInfo &DCI) const {
+SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
assert(Opc == ISD::AND);
auto &DAG = DCI.DAG;
SDLoc DL(Op);
- if(hasAndNot(Op)) {
+ if (hasAndNot(Op)) {
SDValue LHS = Op->getOperand(0);
SDValue RHS = Op->getOperand(1);
@@ -7078,20 +7077,24 @@ SDValue SITargetLowering::combineAnd(SDValue Op,
SDValue Y = RHS->getOperand(0);
SDValue NotZ = RHS->getOperand(1);
- if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+ if (NotZ.getOpcode() == ISD::XOR &&
+ isAllOnesConstant(NotZ->getOperand(1))) {
SDValue Z = NotZ->getOperand(0);
if (!isa<ConstantSDNode>(Y)) {
SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
- SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
- SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
- SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+ SDValue AndNotYZ =
+ DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+ SDValue NotAndNotYZ =
+ DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+ SDValue NewAnd =
+ DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
return NewAnd;
}
}
}
}
-
+
EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
: Op->getOperand(0).getValueType();
auto ExtTy = OpTy.changeElementType(MVT::i32);
>From ee5ca4e00a0c06200fd11901a85d3ec59fb57eef Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 14:27:15 +0800
Subject: [PATCH 08/11] [AMDGPU] Remove combineAnd.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 87 +----------------------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
llvm/test/CodeGen/AMDGPU/andorn2.ll | 44 ++++++++++++
llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ----------
4 files changed, 47 insertions(+), 124 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 751b344bae37a..9e1f887449dc3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7061,84 +7061,6 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
-SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const {
- const unsigned Opc = Op.getOpcode();
- assert(Opc == ISD::AND);
-
- auto &DAG = DCI.DAG;
- SDLoc DL(Op);
-
- if (hasAndNot(Op)) {
- SDValue LHS = Op->getOperand(0);
- SDValue RHS = Op->getOperand(1);
-
- // (and LHS, (or Y, ~Z))
- if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
- SDValue Y = RHS->getOperand(0);
- SDValue NotZ = RHS->getOperand(1);
-
- if (NotZ.getOpcode() == ISD::XOR &&
- isAllOnesConstant(NotZ->getOperand(1))) {
- SDValue Z = NotZ->getOperand(0);
-
- if (!isa<ConstantSDNode>(Y)) {
- SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
- SDValue AndNotYZ =
- DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
- SDValue NotAndNotYZ =
- DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
- SDValue NewAnd =
- DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
- return NewAnd;
- }
- }
- }
- }
-
- EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
- : Op->getOperand(0).getValueType();
- auto ExtTy = OpTy.changeElementType(MVT::i32);
-
- if (DCI.isBeforeLegalizeOps() ||
- isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
- return SDValue();
-
- SDValue LHS;
- SDValue RHS;
- if (Opc == ISD::SELECT) {
- LHS = Op->getOperand(1);
- RHS = Op->getOperand(2);
- } else {
- LHS = Op->getOperand(0);
- RHS = Op->getOperand(1);
- }
-
- const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
- LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
-
- // Special case: for shifts, the RHS always needs a zext.
- if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
- RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
- else
- RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
-
- // setcc always return i1/i1 vec so no need to truncate after.
- if (Opc == ISD::SETCC) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
- }
-
- // For other ops, we extend the operation's return type as well so we need to
- // truncate back to the original type.
- SDValue NewVal;
- if (Opc == ISD::SELECT)
- NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
- else
- NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
-
- return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
-}
-
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
@@ -15372,17 +15294,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
- case ISD::AND:
- if (auto Res = combineAnd(SDValue(N, 0), DCI))
- return Res;
- break;
case ISD::ADD:
case ISD::SUB:
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
+ case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::MUL:
@@ -15490,6 +15408,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CLAMP:
return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
+ SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -17662,7 +17581,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
bool SITargetLowering::hasAndNot(SDValue Op) const {
// Return false if the operation is divergent, as AND-NOT is a scalar-only
// instruction.
- if (Op->isDivergent())
+ if (Op->isDivergent() || !Op->isMachineOpcode())
return false;
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a418bae67ebc4..b348702a5bd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,7 +148,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
- SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 3226a77bb9d34..e1fdddf4438b6 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -25,6 +25,28 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
+; GCN: s_andn2_b32
+define amdgpu_kernel void @scalar_andn2_i32_one_sgpr(
+ ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+entry:
+ %nb = xor i32 %b, -1
+ %r0.val = and i32 %a, %nb
+ store i32 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
+; GCN: s_andn2_b64
+define amdgpu_kernel void @scalar_andn2_i64_one_sgpr(
+ ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+entry:
+ %nb = xor i64 %b, -1
+ %r0.val = and i64 %a, %nb
+ store i64 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
; GCN: s_orn2_b32
define amdgpu_kernel void @scalar_orn2_i32_one_use(
@@ -47,6 +69,28 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
+; GCN: s_orn2_b32
+define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr(
+ ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+entry:
+ %nb = xor i32 %b, -1
+ %r0.val = or i32 %a, %nb
+ store i32 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
+; GCN: s_orn2_b64
+define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr(
+ ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+entry:
+ %nb = xor i64 %b, -1
+ %r0.val = or i64 %a, %nb
+ store i64 %r0.val, ptr addrspace(1) %r0
+ ret void
+}
+
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
; GCN: v_not_b32
; GCN: v_and_b32
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
deleted file mode 100644
index 821709847ab8d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/andornot.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i16
-; GCN: s_not_b32
-; GCN-NEXT: s_lshr_b32
-; GCN-NEXT: s_and_b32
-; GCN-NEXT: s_andn2_b32
-define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
-entry:
- %not_z = xor i16 %z, -1
- %or_y_not_z = or i16 %y, %not_z
- %and_result = and i16 %x, %or_y_not_z
- store i16 %and_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i32
-; GCN: s_andn2_b32
-; GCN-NEXT: s_andn2_b32
-define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
-entry:
- %not_z = xor i32 %z, -1
- %or_y_not_z = or i32 %y, %not_z
- %and_result = and i32 %x, %or_y_not_z
- store i32 %and_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i64
-; GCN: s_andn2_b64
-; GCN-NEXT: s_andn2_b64
-define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
-entry:
- %not_z = xor i64 %z, -1
- %or_y_not_z = or i64 %y, %not_z
- %and_result = and i64 %x, %or_y_not_z
- store i64 %and_result, ptr addrspace(1) %out, align 4
- ret void
-}
>From 2ec01c60b08fcc65c2fa65929e201099843dfcfe Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Mon, 21 Oct 2024 15:59:26 +0800
Subject: [PATCH 09/11] [AMDGPU] Update lit test.
---
llvm/test/CodeGen/AMDGPU/andorn2.ll | 32 +++++++++++------------------
1 file changed, 12 insertions(+), 20 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index e1fdddf4438b6..4fe7e21b2adea 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -27,24 +27,20 @@ entry:
; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
; GCN: s_andn2_b32
-define amdgpu_kernel void @scalar_andn2_i32_one_sgpr(
- ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) {
entry:
%nb = xor i32 %b, -1
- %r0.val = and i32 %a, %nb
- store i32 %r0.val, ptr addrspace(1) %r0
- ret void
+ %and = and i32 %a, %nb
+ ret i32 %and
}
; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
; GCN: s_andn2_b64
-define amdgpu_kernel void @scalar_andn2_i64_one_sgpr(
- ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) {
entry:
%nb = xor i64 %b, -1
- %r0.val = and i64 %a, %nb
- store i64 %r0.val, ptr addrspace(1) %r0
- ret void
+ %and = and i64 %a, %nb
+ ret i64 %and
}
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
@@ -71,24 +67,20 @@ entry:
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
; GCN: s_orn2_b32
-define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr(
- ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) {
entry:
%nb = xor i32 %b, -1
- %r0.val = or i32 %a, %nb
- store i32 %r0.val, ptr addrspace(1) %r0
- ret void
+ %or = or i32 %a, %nb
+ ret i32 %or;
}
; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
; GCN: s_orn2_b64
-define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr(
- ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) {
entry:
%nb = xor i64 %b, -1
- %r0.val = or i64 %a, %nb
- store i64 %r0.val, ptr addrspace(1) %r0
- ret void
+ %or = or i64 %a, %nb
+ ret i64 %or;
}
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
>From 28ea08444b1b5f591f030f7de266949987ccfa19 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 27 Apr 2025 18:03:24 +0800
Subject: [PATCH 10/11] [AMDGPU] Add unfold test.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/andorn2.ll | 36 -----------------------
2 files changed, 1 insertion(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9e1f887449dc3..075a6374a5427 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17581,7 +17581,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
bool SITargetLowering::hasAndNot(SDValue Op) const {
// Return false if the operation is divergent, as AND-NOT is a scalar-only
// instruction.
- if (Op->isDivergent() || !Op->isMachineOpcode())
+ if (Op->isDivergent())
return false;
EVT VT = Op.getValueType();
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 4fe7e21b2adea..3226a77bb9d34 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -25,24 +25,6 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
-; GCN: s_andn2_b32
-define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) {
-entry:
- %nb = xor i32 %b, -1
- %and = and i32 %a, %nb
- ret i32 %and
-}
-
-; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
-; GCN: s_andn2_b64
-define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) {
-entry:
- %nb = xor i64 %b, -1
- %and = and i64 %a, %nb
- ret i64 %and
-}
-
; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
; GCN: s_orn2_b32
define amdgpu_kernel void @scalar_orn2_i32_one_use(
@@ -65,24 +47,6 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
-; GCN: s_orn2_b32
-define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) {
-entry:
- %nb = xor i32 %b, -1
- %or = or i32 %a, %nb
- ret i32 %or;
-}
-
-; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
-; GCN: s_orn2_b64
-define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) {
-entry:
- %nb = xor i64 %b, -1
- %or = or i64 %a, %nb
- ret i64 %or;
-}
-
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
; GCN: v_not_b32
; GCN: v_and_b32
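The last patch in the series (below) updates tests whose uniform masked-merge and bitfield-select patterns change once hasAndNot returns true for uniform values. A minimal sketch of the input pattern involved (names are illustrative), mirroring s_in32 in unfold-masked-merge-scalar-variablemask.ll:

define i32 @masked_merge_sketch(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
  ; ((x ^ y) & mask) ^ y == (x & mask) | (y & ~mask)
  %n0 = xor i32 %x, %y
  %n1 = and i32 %n0, %mask
  %r  = xor i32 %n1, %y
  ret i32 %r
}

With hasAndNot returning true for uniform i32, the combiner is expected to unfold this into the and / and-not / or form, which is why the updated checks use s_and_b32, s_and_not1_b32 (s_andn2_b32 on older targets) and s_or_b32 instead of the xor/and/xor sequence.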
>From dba615541bbdd6da84068d3450ba07164b8c9f02 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 27 Apr 2025 18:57:18 +0800
Subject: [PATCH 11/11] [AMDGPU] Update.
---
llvm/test/CodeGen/AMDGPU/bfi_int.ll | 109 ++++++++-----
llvm/test/CodeGen/AMDGPU/commute-compares.ll | 17 ++-
...unfold-masked-merge-scalar-variablemask.ll | 143 ++++++++----------
3 files changed, 140 insertions(+), 129 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d479c68..6e9cd8807b379 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_xor_b32 s1, s1, s2
-; GFX7-NEXT: s_and_b32 s0, s0, s1
-; GFX7-NEXT: s_xor_b32 s0, s2, s0
+; GFX7-NEXT: s_andn2_b32 s2, s2, s0
+; GFX7-NEXT: s_and_b32 s0, s1, s0
+; GFX7-NEXT: s_or_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s1, s1, s2
-; GFX8-NEXT: s_and_b32 s0, s0, s1
-; GFX8-NEXT: s_xor_b32 s0, s2, s0
+; GFX8-NEXT: s_andn2_b32 s2, s2, s0
+; GFX8-NEXT: s_and_b32 s0, s1, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s0, s0, s1
-; GFX10-NEXT: s_xor_b32 s0, s2, s0
+; GFX10-NEXT: s_andn2_b32 s2, s2, s0
+; GFX10-NEXT: s_and_b32 s0, s1, s0
+; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
@@ -317,19 +317,26 @@ entry:
define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
+; GFX7-NEXT: s_not_b32 s1, s1
+; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_nand_b32 s0, s1, s0
+; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
+; GFX8-NEXT: s_not_b32 s1, s1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: s_nand_b32 s0, s1, s0
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: s_not_b32 s1, s1
+; GFX10-NEXT: s_nand_b32 s0, s1, s0
+; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
@@ -350,30 +357,40 @@ entry:
ret float %cast
}
-define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
+define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_v_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX7-NEXT: s_not_b32 s1, s1
+; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_nand_b32 s0, s1, s0
+; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-NEXT: s_not_b32 s1, s1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: s_nand_b32 s0, s1, s0
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: s_not_b32 s1, s1
+; GFX10-NEXT: s_nand_b32 s0, s1, s0
+; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%xor0 = xor i32 %y, %z
@@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6
define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
+; GFX7-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX7-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX7-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX7-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX7-NEXT: v_and_b32_e32 v1, s1, v1
+; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
+; GFX8-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v1, s1, v1
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
-; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
+; GFX10-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX10-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX10-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX10-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
@@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index ae8080cf9f06a..f6deb6c6df422 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -541,19 +541,20 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs
; GCN-LABEL: commute_sgt_neg1_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GCN-NEXT: v_not_b32_e32 v0, v0
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index 69724aa75af4f..56990a7b73310 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -39,11 +39,10 @@ define i32 @s_in32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
; GCN-LABEL: s_in32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -56,11 +55,10 @@ define i64 @s_in64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
; GCN-LABEL: s_in64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17]
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17]
-; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i64 %x, %y
@@ -75,11 +73,10 @@ define i32 @s_in_commutativity_0_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask
; GCN-LABEL: s_in_commutativity_0_0_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s0, s2, s0
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -92,11 +89,10 @@ define i32 @s_in_commutativity_0_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask
; GCN-LABEL: s_in_commutativity_0_1_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s1, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -109,11 +105,10 @@ define i32 @in_commutativity_0_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
; GCN-LABEL: in_commutativity_0_1_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s0, s2, s0
-; GCN-NEXT: s_xor_b32 s0, s1, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -126,11 +121,10 @@ define i32 @s_in_commutativity_1_0_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask
; GCN-LABEL: s_in_commutativity_1_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s1, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
; GCN-NEXT: s_and_b32 s1, s1, s2
-; GCN-NEXT: s_xor_b32 s0, s1, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -143,11 +137,10 @@ define i32 @s_in_commutativity_1_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask
; GCN-LABEL: s_in_commutativity_1_0_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s1, s0, s1
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s2
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s1, s2, s1
-; GCN-NEXT: s_xor_b32 s0, s1, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -160,11 +153,10 @@ define i32 @s_in_commutativity_1_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask
; GCN-LABEL: s_in_commutativity_1_1_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s1, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
; GCN-NEXT: s_and_b32 s1, s1, s2
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -177,11 +169,10 @@ define i32 @s_in_commutativity_1_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask
; GCN-LABEL: s_in_commutativity_1_1_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s1, s0, s1
+; GCN-NEXT: s_and_not1_b32 s0, s0, s2
+; GCN-NEXT: s_and_b32 s1, s1, s2
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s1, s2, s1
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, %y
@@ -197,11 +188,10 @@ define i32 @s_in_complex_y0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_and_b32 s1, s1, s2
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: s_and_b32 s0, s0, s3
+; GCN-NEXT: s_and_not1_b32 s1, s1, s3
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%y = and i32 %y_hi, %y_low
@@ -216,11 +206,10 @@ define i32 @s_in_complex_y1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_and_b32 s1, s1, s2
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: s_and_b32 s0, s0, s3
+; GCN-NEXT: s_and_not1_b32 s1, s1, s3
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_xor_b32 s0, s1, s0
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%y = and i32 %y_hi, %y_low
@@ -237,10 +226,10 @@ define i32 @s_in_complex_m0(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inre
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_xor_b32 s2, s2, s3
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -256,10 +245,10 @@ define i32 @s_in_complex_m1(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inre
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_xor_b32 s2, s2, s3
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s0, s2, s0
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -278,10 +267,10 @@ define i32 @s_in_complex_y0_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low,
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_and_b32 s1, s1, s2
; GCN-NEXT: s_xor_b32 s2, s3, s16
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -299,10 +288,10 @@ define i32 @s_in_complex_y1_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low,
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_and_b32 s1, s1, s2
; GCN-NEXT: s_xor_b32 s2, s3, s16
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s1, s0
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -320,10 +309,10 @@ define i32 @s_in_complex_y0_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low,
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_and_b32 s1, s1, s2
; GCN-NEXT: s_xor_b32 s2, s3, s16
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s0, s2, s0
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -341,10 +330,10 @@ define i32 @s_in_complex_y1_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low,
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_and_b32 s1, s1, s2
; GCN-NEXT: s_xor_b32 s2, s3, s16
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s0, s2, s0
-; GCN-NEXT: s_xor_b32 s0, s1, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -444,11 +433,10 @@ define i32 @in_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
; GCN-LABEL: in_constant_varx_42:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s0, 42
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_not1_b32 s1, 42, s2
; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, 42
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 %x, 42
@@ -480,11 +468,10 @@ define i32 @s_in_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
; GCN-LABEL: s_in_constant_varx_42_invmask:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s0, 42
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_b32 s1, s2, 42
; GCN-NEXT: s_and_not1_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, 42
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%notmask = xor i32 %mask, -1
@@ -577,11 +564,10 @@ define i32 @s_in_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
; GCN-LABEL: s_in_constant_42_vary:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s1, 42
+; GCN-NEXT: s_and_not1_b32 s0, s1, s2
+; GCN-NEXT: s_and_b32 s1, s2, 42
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%n0 = xor i32 42, %y
@@ -613,11 +599,10 @@ define i32 @s_in_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
; GCN-LABEL: s_in_constant_42_vary_invmask:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s0, s1, 42
+; GCN-NEXT: s_and_b32 s0, s1, s2
+; GCN-NEXT: s_and_not1_b32 s1, 42, s2
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_and_not1_b32 s0, s0, s2
-; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%notmask = xor i32 %mask, -1