[llvm] [AMDGPU] using divergent/uniform information in ISel of zext (PR #174539)
Zeng Wu via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 14 00:49:39 PDT 2026
https://github.com/zwu-2025 updated https://github.com/llvm/llvm-project/pull/174539
>From 73353214f9c7de605c8b8109c705e7d4048eeeea Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Fri, 6 Mar 2026 04:42:31 +0000
Subject: [PATCH 1/6] Change decision order of UseRC and use
common-sub-regclass
---
llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4ad721bf21959..3096ab5ccb698 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -24,9 +24,11 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/PseudoProbe.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -105,8 +107,6 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
MVT VT = Op.getSimpleValueType();
// Stick to the preferred register classes for legal types.
- if (TLI->isTypeLegal(VT))
- UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
for (SDNode *User : Op->users()) {
bool Match = true;
@@ -131,6 +131,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
RC = TRI->getAllocatableClass(
TII->getRegClass(II, i + II.getNumDefs()));
}
+
if (!UseRC)
UseRC = RC;
else if (RC) {
@@ -149,6 +150,18 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
break;
}
+ const TargetRegisterClass *RegClassForVT = nullptr;
+ // This check will be removed in another pending PR; it is kept for now to keep SystemZ happy.
+ if (TLI->isTypeLegal(VT)) {
+ RegClassForVT = TLI->getRegClassFor(VT, Op->isDivergent());
+ }
+
+ if (UseRC == nullptr || !UseRC->isAllocatable()) {
+ UseRC = RegClassForVT;
+ } else if (auto CommonSubClass = TRI->getCommonSubClass(UseRC, RegClassForVT)) {
+ UseRC = CommonSubClass;
+ }
+
const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
>From 166eb11e087769f2b266f248896b68944f7abe15 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Fri, 6 Mar 2026 04:43:45 +0000
Subject: [PATCH 2/6] Use divergent/uniform information for the zext pattern in
tablegen
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ca5a4d7301bda..349deb8d93633 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,12 +3038,28 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// instructions resulting in the copies from SCC to these instructions
// will be moved to the VALU.
+def : GCNPat <
+ (i32 (UniformUnaryFrag<zext> i1:$src)),
+ (S_AND_B32 $src, (i32 1))
+>;
+
+// since operand of S_AND_B64 is SREG-64 and it does not support i1.
+let AddedComplexity = 10 in
+def : GCNPat <
+ (i64 (UniformUnaryFrag<zext> i1:$src)),
+ (REG_SEQUENCE SReg_64,
+ (S_MOV_B32 (S_AND_B32 $src, (i32 1))), sub0,
+ (S_MOV_B32 (i32 0)), sub1
+ )
+>;
+
let WaveSizePredicate = isWave64 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
>;
+
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
>From e399a6d08ea76d442f03755e12c37b8fcaa2ece5 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Fri, 6 Mar 2026 04:47:41 +0000
Subject: [PATCH 3/6] uniform propagation in dag combiner
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 301f2fc8dab45..dc11714f3d519 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16868,6 +16868,14 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
+ if (DCI.isAfterLegalizeDAG()) {
+ // in generic combini, (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y,
+ // cc), 1.0, 0.0)
+ if (N->getValueType(0).isVector()) {
+ DCI.DAG.updateDivergence(N);
+ }
+ }
+
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..ef09aa4dede3c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2005,7 +2005,7 @@ let AddedComplexity = 20 in {
// TODO: The predicate should not be necessary, but enabling this pattern for
// all subtargets generates worse code in some cases.
- let OtherPredicates = [HasPseudoScalarTrans] in
+ // let OtherPredicates = [HasPseudoScalarTrans] in
def : GCNPat<
(f32 (UniformSelect f32:$src0, f32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>From 686894fe7f3754258803e91072d0461aa2a402fc Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Wed, 11 Mar 2026 06:32:45 +0000
Subject: [PATCH 4/6] Handle copy to SCC
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 37 +++++++++++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 +-
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 54 +------------------
5 files changed, 38 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8698e816ddbb9..5601efa3a4356 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -327,10 +327,33 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
return false;
}
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- if (!Subtarget->d16PreservesUnusedBits())
- return;
+bool AMDGPUDAGToDAGISel::preprocessSelect(SDNode *N) const {
+ if (!Subtarget->isWave64()) return false;
+
+ // If VALU is selected for `select`, no changes needed.
+ auto VT = N->getValueType(0);
+ auto ScalarPred = !N->isDivergent() && (VT == MVT::i32 || (VT == MVT::f32 && Subtarget->hasPseudoScalarTrans()));
+ if (!ScalarPred)
+ return false;
+
+ SDValue Cond = N->getOperand(0);
+
+ SDLoc DL(N);
+
+ SDValue Mask = CurDAG->getRegister(AMDGPU::EXEC, MVT::i64);
+ SDValue Ext = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Cond), 0);
+ SDValue NewCond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, DL, MVT::i64, MVT::Glue, Mask, Ext), 0);
+ Cond = NewCond;
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ SDValue NewSel =
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_CSELECT_B32, DL, VT, LHS, RHS, SDValue(Cond.getNode(), 1)), 0);
+ CurDAG->ReplaceAllUsesWith(N, NewSel.getNode());
+ return true;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
@@ -342,8 +365,14 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
switch (N->getOpcode()) {
case ISD::BUILD_VECTOR:
// TODO: Match load d16 from shl (extload:i16), 16
- MadeChange |= matchLoadD16FromBuildVector(N);
+ if (Subtarget->d16PreservesUnusedBits()) {
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ }
+
break;
+ case ISD::SELECT:
+ MadeChange |= preprocessSelect(N);
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a86b75458923e..706bd1266f4ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -59,6 +59,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Instructions that will be lowered with a final instruction that zeros the
// high result bits.
bool fp16SrcZerosHighBits(unsigned Opc) const;
+ bool preprocessSelect(SDNode *N) const;
public:
AMDGPUDAGToDAGISel() = delete;
@@ -301,7 +302,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
class AMDGPUISelDAGToDAGPass : public SelectionDAGISelPass {
public:
AMDGPUISelDAGToDAGPass(TargetMachine &TM);
-
PreservedAnalyses run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM);
};
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 39a6a7762eea5..1aa7fdbd0bf33 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -782,7 +782,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
lowerVGPR2SGPRCopies(MF);
// Postprocessing
- fixSCCCopies(MF);
+ // fixSCCCopies(MF);
for (auto *MI : S2VCopies) {
// Check if it is still valid
if (MI->isCopy()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7535407741f1f..2cd9741199050 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -628,6 +628,7 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc,
const char *Msg = "illegal VGPR to SGPR copy") {
+ assert (false && "illegal VGPR to SGPR copy");
MachineFunction *MF = MBB.getParent();
LLVMContext &C = MF->getFunction().getContext();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 349deb8d93633..9f6185c0ee05f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3043,57 +3043,7 @@ def : GCNPat <
(S_AND_B32 $src, (i32 1))
>;
-// since operand of S_AND_B64 is SREG-64 and it does not support i1.
-let AddedComplexity = 10 in
-def : GCNPat <
- (i64 (UniformUnaryFrag<zext> i1:$src)),
- (REG_SEQUENCE SReg_64,
- (S_MOV_B32 (S_AND_B32 $src, (i32 1))), sub0,
- (S_MOV_B32 (i32 0)), sub1
- )
->;
-
-let WaveSizePredicate = isWave64 in {
-def : GCNPat <
- (i1 (and i1:$src0, i1:$src1)),
- (S_AND_B64 $src0, $src1)
->;
-
-
-def : GCNPat <
- (i1 (or i1:$src0, i1:$src1)),
- (S_OR_B64 $src0, $src1)
->;
-
-def : GCNPat <
- (i1 (xor i1:$src0, i1:$src1)),
- (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
- (i1 (add i1:$src0, i1:$src1)),
- (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
- (i1 (sub i1:$src0, i1:$src1)),
- (S_XOR_B64 $src0, $src1)
->;
-
-let AddedComplexity = 1 in {
-def : GCNPat <
- (i1 (add i1:$src0, (i1 -1))),
- (S_NOT_B64 $src0)
->;
-
-def : GCNPat <
- (i1 (sub i1:$src0, (i1 -1))),
- (S_NOT_B64 $src0)
->;
-}
-} // end isWave64
-
-let WaveSizePredicate = isWave32 in {
+// let WaveSizePredicate = isWave32 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B32 $src0, $src1)
@@ -3130,7 +3080,7 @@ def : GCNPat <
(S_NOT_B32 $src0)
>;
}
-} // end isWave32
+//} // end isWave32
def : GCNPat <
(i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
>From 9879048df874bc7063b046809112358314021b0e Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Thu, 12 Mar 2026 07:07:45 +0000
Subject: [PATCH 5/6] Add constraints on the instruction selection of i1
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 46 ++++++++++++++++---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ----
llvm/lib/Target/AMDGPU/SIInstructions.td | 11 ++++-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 9 ++--
5 files changed, 54 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5601efa3a4356..8f5ba672a2a0e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,8 +21,10 @@
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -327,24 +329,54 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
return false;
}
+bool AMDGPUDAGToDAGISel::SetCCFromFP(const SDNode *N) const {
+ const SDNode *CondNode = N;
+ if (CondNode->getOpcode() == ISD::FREEZE) {
+ CondNode = CondNode->getOperand(0).getNode();
+ }
+
+ auto SetCCOpcodes = {ISD::SETCC, ISD::FP_TO_UINT};
+ auto LogicalOpcodes = {ISD::AND, ISD::OR, ISD::XOR};
+
+ SmallVector<const SDNode*, 4> SetCCOpnds;
+ SmallVector<const SDNode*, 4> Stack;
+ Stack.push_back(CondNode);
+ while (!Stack.empty()) {
+ auto Node = Stack.pop_back_val();
+ if (llvm::find(SetCCOpcodes, Node->getOpcode()) != SetCCOpcodes.end()) {
+ SetCCOpnds.push_back(Node);
+ } else if (llvm::find(LogicalOpcodes, Node->getOpcode()) != LogicalOpcodes.end()) {
+ Stack.push_back(Node->getOperand(0).getNode());
+ Stack.push_back(Node->getOperand(1).getNode());
+ } else {
+ assert(false);
+ }
+ }
+
+ return llvm::any_of(SetCCOpnds, [](const SDNode* Node) {
+ for (auto It = Node->op_begin(); It != Node->op_end(); ++It) {
+ if (It->getValueType() == MVT::f16 || It->getValueType() == MVT::f32) return true;
+ }
+ return false;
+ });
+}
+
bool AMDGPUDAGToDAGISel::preprocessSelect(SDNode *N) const {
if (!Subtarget->isWave64()) return false;
// If VALU is selected for `select`, no changes needed.
auto VT = N->getValueType(0);
- auto ScalarPred = !N->isDivergent() && (VT == MVT::i32 || (VT == MVT::f32 && Subtarget->hasPseudoScalarTrans()));
- if (!ScalarPred)
- return false;
-
SDValue Cond = N->getOperand(0);
+ const SDNode *CondNode = Cond.getNode();
+ if (SetCCFromFP(CondNode)) {
+ return false;
+ }
SDLoc DL(N);
SDValue Mask = CurDAG->getRegister(AMDGPU::EXEC, MVT::i64);
SDValue Ext = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Cond), 0);
- SDValue NewCond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, DL, MVT::i64, MVT::Glue, Mask, Ext), 0);
-
- Cond = NewCond;
+ Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, DL, MVT::i64, MVT::Glue, Mask, Ext), 0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDValue NewSel =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 706bd1266f4ca..a7baff1f48f4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -60,6 +60,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// high result bits.
bool fp16SrcZerosHighBits(unsigned Opc) const;
bool preprocessSelect(SDNode *N) const;
+ bool SetCCFromFP(const SDNode *N) const;
public:
AMDGPUDAGToDAGISel() = delete;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dc11714f3d519..301f2fc8dab45 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16868,14 +16868,6 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
- if (DCI.isAfterLegalizeDAG()) {
- // in generic combini, (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y,
- // cc), 1.0, 0.0)
- if (N->getValueType(0).isVector()) {
- DCI.DAG.updateDivergence(N);
- }
- }
-
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9f6185c0ee05f..06ce3585819f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,8 +3038,17 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// instructions resulting in the copies from SCC to these instructions
// will be moved to the VALU.
+def UniformAND : PatFrag<
+ (ops node:$src),
+ (zext $src),
+ [{
+ if (N->isDivergent() || N->getOperand(0)) return false;
+ return !SetCCFromFP(N->getOperand(0).getNode());
+ }]
+>;
+
def : GCNPat <
- (i32 (UniformUnaryFrag<zext> i1:$src)),
+ (i32 (UniformAND i1:$src)),
(S_AND_B32 $src, (i32 1))
>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index ef09aa4dede3c..4f88dad50198b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1990,11 +1990,13 @@ def : GCNPat<
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
-
def UniformSelect : PatFrag<
(ops node:$src0, node:$src1),
(select SCC, $src0, $src1),
- [{ return !N->isDivergent(); }]
+ [{
+ if (N->isDivergent() || N->getOperand(0)->isDivergent()) return false;
+ return !SetCCFromFP(N->getOperand(0).getNode());
+ }]
>;
let AddedComplexity = 20 in {
@@ -2003,9 +2005,6 @@ let AddedComplexity = 20 in {
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;
- // TODO: The predicate should not be necessary, but enabling this pattern for
- // all subtargets generates worse code in some cases.
- // let OtherPredicates = [HasPseudoScalarTrans] in
def : GCNPat<
(f32 (UniformSelect f32:$src0, f32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>From dd058cf8c45ee99122cc4909df124ff127742c0a Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Sat, 14 Mar 2026 07:47:47 +0000
Subject: [PATCH 6/6] Add back predicate isWave64 for S_AND etc since it is
selected for results of V_CMP
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 45 ++++++++++++++++++-
2 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8f5ba672a2a0e..48795d61c407a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -349,7 +349,7 @@ bool AMDGPUDAGToDAGISel::SetCCFromFP(const SDNode *N) const {
Stack.push_back(Node->getOperand(0).getNode());
Stack.push_back(Node->getOperand(1).getNode());
} else {
- assert(false);
+ return true;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 06ce3585819f2..c7f7ad302203b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3052,7 +3052,48 @@ def : GCNPat <
(S_AND_B32 $src, (i32 1))
>;
-// let WaveSizePredicate = isWave32 in {
+// There is a pattern where results produced by V_CMP are consumed by S_AND etc.
+let WaveSizePredicate = isWave64 in {
+def : GCNPat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
+>;
+
+
+def : GCNPat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (add i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+ (i1 (add i1:$src0, (i1 -1))),
+ (S_NOT_B64 $src0)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, (i1 -1))),
+ (S_NOT_B64 $src0)
+>;
+}
+} // end isWave64
+
+let WaveSizePredicate = isWave32 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B32 $src0, $src1)
@@ -3089,7 +3130,7 @@ def : GCNPat <
(S_NOT_B32 $src0)
>;
}
-//} // end isWave32
+} // end isWave32
def : GCNPat <
(i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
More information about the llvm-commits
mailing list