[llvm] [AMDGPU] using divergent/uniform information in ISel of zext (PR #174539)

Sat Mar 14 00:49:39 PDT 2026

https://github.com/zwu-2025 updated https://github.com/llvm/llvm-project/pull/174539

>From 73353214f9c7de605c8b8109c705e7d4048eeeea Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Fri, 6 Mar 2026 04:42:31 +0000
Subject: [PATCH 1/6] change decision order of UseRC and use
 common-sub-regclass

---
 llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4ad721bf21959..3096ab5ccb698 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -24,9 +24,11 @@
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/PseudoProbe.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
@@ -105,8 +107,6 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
   MVT VT = Op.getSimpleValueType();
 
   // Stick to the preferred register classes for legal types.
-  if (TLI->isTypeLegal(VT))
-    UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
 
   for (SDNode *User : Op->users()) {
     bool Match = true;
@@ -131,6 +131,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
             RC = TRI->getAllocatableClass(
                 TII->getRegClass(II, i + II.getNumDefs()));
           }
+
           if (!UseRC)
             UseRC = RC;
           else if (RC) {
@@ -149,6 +150,18 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
       break;
   }
 
+  const TargetRegisterClass *RegClassForVT = nullptr;
+  // This check is to be removed in another pending PR; it is kept for now to keep SystemZ happy.
+  if (TLI->isTypeLegal(VT)) {
+      RegClassForVT = TLI->getRegClassFor(VT, Op->isDivergent());
+  }
+
+  if (UseRC == nullptr || !UseRC->isAllocatable()) {
+      UseRC = RegClassForVT;
+  } else if (auto CommonSubClass = TRI->getCommonSubClass(UseRC, RegClassForVT)) {
+      UseRC = CommonSubClass;
+  }
+
   const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
 

>From 166eb11e087769f2b266f248896b68944f7abe15 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Fri, 6 Mar 2026 04:43:45 +0000
Subject: [PATCH 2/6] use divergent/uniform information in zext pattern in
 tablegen

---
 llvm/lib/Target/AMDGPU/SIInstructions.td | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ca5a4d7301bda..349deb8d93633 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,12 +3038,28 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
 // instructions resulting in the copies from SCC to these instructions
 // will be moved to the VALU.
 
+def : GCNPat <
+ (i32 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B32 $src, (i32 1))
+>;
+
+// since operand of S_AND_B64 is SREG-64 and it does not support i1.
+let AddedComplexity = 10 in
+def : GCNPat <
+  (i64 (UniformUnaryFrag<zext> i1:$src)),
+  (REG_SEQUENCE SReg_64,
+    (S_MOV_B32 (S_AND_B32 $src, (i32 1))), sub0,
+    (S_MOV_B32 (i32 0)), sub1
+  )
+>;
+
 let WaveSizePredicate = isWave64 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),
   (S_AND_B64 $src0, $src1)
 >;
 
+
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B64 $src0, $src1)

>From e399a6d08ea76d442f03755e12c37b8fcaa2ece5 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Fri, 6 Mar 2026 04:47:41 +0000
Subject: [PATCH 3/6] uniform propagation in dag combiner

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 301f2fc8dab45..dc11714f3d519 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16868,6 +16868,14 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
   if (Cond.getOpcode() != ISD::SETCC)
     return SDValue();
 
+  if (DCI.isAfterLegalizeDAG()) {
+      // in generic combini, (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y,
+      // cc), 1.0, 0.0)
+      if (N->getValueType(0).isVector()) {
+          DCI.DAG.updateDivergence(N);
+      }
+  }
+
   SDValue LHS = Cond.getOperand(0);
   SDValue RHS = Cond.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..ef09aa4dede3c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2005,7 +2005,7 @@ let AddedComplexity = 20 in {
 
   // TODO: The predicate should not be necessary, but enabling this pattern for
   // all subtargets generates worse code in some cases.
-  let OtherPredicates = [HasPseudoScalarTrans] in
+  // let OtherPredicates = [HasPseudoScalarTrans] in
   def : GCNPat<
     (f32 (UniformSelect f32:$src0, f32:$src1)),
     (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)

>From 686894fe7f3754258803e91072d0461aa2a402fc Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Wed, 11 Mar 2026 06:32:45 +0000
Subject: [PATCH 4/6] handle copy to scc

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 37 +++++++++++--
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  2 +-
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |  2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 54 +------------------
 5 files changed, 38 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8698e816ddbb9..5601efa3a4356 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -327,10 +327,33 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   return false;
 }
 
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  if (!Subtarget->d16PreservesUnusedBits())
-    return;
+bool AMDGPUDAGToDAGISel::preprocessSelect(SDNode *N) const {
+  if (!Subtarget->isWave64()) return false;
+
+  // If VALU is selected for `select`, no changes needed.
+  auto VT = N->getValueType(0);
+  auto ScalarPred = !N->isDivergent() && (VT == MVT::i32 || (VT == MVT::f32 && Subtarget->hasPseudoScalarTrans()));
+  if (!ScalarPred)
+      return false;
+
+  SDValue Cond = N->getOperand(0);
+
+  SDLoc DL(N);
+
+  SDValue Mask = CurDAG->getRegister(AMDGPU::EXEC, MVT::i64);
+  SDValue Ext = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Cond), 0);
+  SDValue NewCond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, DL, MVT::i64, MVT::Glue, Mask, Ext), 0);
 
+  Cond = NewCond;
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  SDValue NewSel =
+      SDValue(CurDAG->getMachineNode(AMDGPU::S_CSELECT_B32, DL, VT, LHS, RHS, SDValue(Cond.getNode(), 1)), 0);
+  CurDAG->ReplaceAllUsesWith(N, NewSel.getNode());
+  return true;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
   bool MadeChange = false;
@@ -342,8 +365,14 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
     switch (N->getOpcode()) {
     case ISD::BUILD_VECTOR:
       // TODO: Match load d16 from shl (extload:i16), 16
-      MadeChange |= matchLoadD16FromBuildVector(N);
+      if (Subtarget->d16PreservesUnusedBits()) {
+        MadeChange |= matchLoadD16FromBuildVector(N);
+      }
+
       break;
+    case ISD::SELECT:
+      MadeChange |= preprocessSelect(N);
+        break;
     default:
       break;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a86b75458923e..706bd1266f4ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -59,6 +59,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Instructions that will be lowered with a final instruction that zeros the
   // high result bits.
   bool fp16SrcZerosHighBits(unsigned Opc) const;
+  bool preprocessSelect(SDNode *N) const;
 
 public:
   AMDGPUDAGToDAGISel() = delete;
@@ -301,7 +302,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 class AMDGPUISelDAGToDAGPass : public SelectionDAGISelPass {
 public:
   AMDGPUISelDAGToDAGPass(TargetMachine &TM);
-
   PreservedAnalyses run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM);
 };
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 39a6a7762eea5..1aa7fdbd0bf33 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -782,7 +782,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
 
   lowerVGPR2SGPRCopies(MF);
   // Postprocessing
-  fixSCCCopies(MF);
+  // fixSCCCopies(MF);
   for (auto *MI : S2VCopies) {
     // Check if it is still valid
     if (MI->isCopy()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7535407741f1f..2cd9741199050 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -628,6 +628,7 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               const char *Msg = "illegal VGPR to SGPR copy") {
+  assert (false && "illegal VGPR to SGPR copy");
   MachineFunction *MF = MBB.getParent();
 
   LLVMContext &C = MF->getFunction().getContext();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 349deb8d93633..9f6185c0ee05f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3043,57 +3043,7 @@ def : GCNPat <
   (S_AND_B32 $src, (i32 1))
 >;
 
-// since operand of S_AND_B64 is SREG-64 and it does not support i1.
-let AddedComplexity = 10 in
-def : GCNPat <
-  (i64 (UniformUnaryFrag<zext> i1:$src)),
-  (REG_SEQUENCE SReg_64,
-    (S_MOV_B32 (S_AND_B32 $src, (i32 1))), sub0,
-    (S_MOV_B32 (i32 0)), sub1
-  )
->;
-
-let WaveSizePredicate = isWave64 in {
-def : GCNPat <
-  (i1 (and i1:$src0, i1:$src1)),
-  (S_AND_B64 $src0, $src1)
->;
-
-
-def : GCNPat <
-  (i1 (or i1:$src0, i1:$src1)),
-  (S_OR_B64 $src0, $src1)
->;
-
-def : GCNPat <
-  (i1 (xor i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
-  (i1 (add i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
-  (i1 (sub i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
->;
-
-let AddedComplexity = 1 in {
-def : GCNPat <
-  (i1 (add i1:$src0, (i1 -1))),
-  (S_NOT_B64 $src0)
->;
-
-def : GCNPat <
-  (i1 (sub i1:$src0, (i1 -1))),
-  (S_NOT_B64 $src0)
->;
-}
-} // end isWave64
-
-let WaveSizePredicate = isWave32 in {
+// let WaveSizePredicate = isWave32 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),
   (S_AND_B32 $src0, $src1)
@@ -3130,7 +3080,7 @@ def : GCNPat <
   (S_NOT_B32 $src0)
 >;
 }
-} // end isWave32
+//} // end isWave32
 
 def : GCNPat <
   (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),

>From 9879048df874bc7063b046809112358314021b0e Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Thu, 12 Mar 2026 07:07:45 +0000
Subject: [PATCH 5/6] Add constraints on the instruction selection of i1

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 46 ++++++++++++++++---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  8 ----
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 11 ++++-
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  9 ++--
 5 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5601efa3a4356..8f5ba672a2a0e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,8 +21,10 @@
 #include "R600RegisterInfo.h"
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -327,24 +329,54 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   return false;
 }
 
+bool AMDGPUDAGToDAGISel::SetCCFromFP(const SDNode *N) const {
+  const SDNode *CondNode = N;
+  if (CondNode->getOpcode() == ISD::FREEZE) {
+      CondNode = CondNode->getOperand(0).getNode();
+  }
+
+  auto SetCCOpcodes = {ISD::SETCC, ISD::FP_TO_UINT};
+  auto LogicalOpcodes  = {ISD::AND, ISD::OR, ISD::XOR};
+
+  SmallVector<const SDNode*, 4> SetCCOpnds;
+  SmallVector<const SDNode*, 4> Stack;
+  Stack.push_back(CondNode);
+  while (!Stack.empty()) {
+      auto Node = Stack.pop_back_val();
+      if (llvm::find(SetCCOpcodes, Node->getOpcode()) != SetCCOpcodes.end()) {
+          SetCCOpnds.push_back(Node);
+      } else if (llvm::find(LogicalOpcodes, Node->getOpcode()) != LogicalOpcodes.end()) {
+          Stack.push_back(Node->getOperand(0).getNode());
+          Stack.push_back(Node->getOperand(1).getNode());
+      } else {
+          assert(false);
+      }
+  }
+
+  return llvm::any_of(SetCCOpnds, [](const SDNode* Node) {
+      for (auto It = Node->op_begin(); It != Node->op_end(); ++It) {
+          if (It->getValueType() == MVT::f16 || It->getValueType() == MVT::f32) return true;
+      }
+      return false;
+  });
+}
+
 bool AMDGPUDAGToDAGISel::preprocessSelect(SDNode *N) const {
   if (!Subtarget->isWave64()) return false;
 
   // If VALU is selected for `select`, no changes needed.
   auto VT = N->getValueType(0);
-  auto ScalarPred = !N->isDivergent() && (VT == MVT::i32 || (VT == MVT::f32 && Subtarget->hasPseudoScalarTrans()));
-  if (!ScalarPred)
-      return false;
-
   SDValue Cond = N->getOperand(0);
+  const SDNode *CondNode = Cond.getNode();
+  if (SetCCFromFP(CondNode)) {
+      return false;
+  }
 
   SDLoc DL(N);
 
   SDValue Mask = CurDAG->getRegister(AMDGPU::EXEC, MVT::i64);
   SDValue Ext = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Cond), 0);
-  SDValue NewCond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, DL, MVT::i64, MVT::Glue, Mask, Ext), 0);
-
-  Cond = NewCond;
+  Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, DL, MVT::i64, MVT::Glue, Mask, Ext), 0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
   SDValue NewSel =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 706bd1266f4ca..a7baff1f48f4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -60,6 +60,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // high result bits.
   bool fp16SrcZerosHighBits(unsigned Opc) const;
   bool preprocessSelect(SDNode *N) const;
+  bool SetCCFromFP(const SDNode *N) const;
 
 public:
   AMDGPUDAGToDAGISel() = delete;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dc11714f3d519..301f2fc8dab45 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16868,14 +16868,6 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
   if (Cond.getOpcode() != ISD::SETCC)
     return SDValue();
 
-  if (DCI.isAfterLegalizeDAG()) {
-      // in generic combini, (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y,
-      // cc), 1.0, 0.0)
-      if (N->getValueType(0).isVector()) {
-          DCI.DAG.updateDivergence(N);
-      }
-  }
-
   SDValue LHS = Cond.getOperand(0);
   SDValue RHS = Cond.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9f6185c0ee05f..06ce3585819f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,8 +3038,17 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
 // instructions resulting in the copies from SCC to these instructions
 // will be moved to the VALU.
 
+def UniformAND : PatFrag<
+  (ops node:$src),
+  (zext $src),
+  [{ // Match a uniform zext whose operand is uniform and not flagged by SetCCFromFP.
+    return !N->isDivergent() && !N->getOperand(0)->isDivergent() &&
+           !SetCCFromFP(N->getOperand(0).getNode());
+  }]
+>;
+
 def : GCNPat <
- (i32 (UniformUnaryFrag<zext> i1:$src)),
+ (i32 (UniformAND i1:$src)),
   (S_AND_B32 $src, (i32 1))
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index ef09aa4dede3c..4f88dad50198b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1990,11 +1990,13 @@ def : GCNPat<
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
-
 def UniformSelect : PatFrag<
   (ops node:$src0, node:$src1),
   (select SCC, $src0, $src1),
-  [{ return !N->isDivergent(); }]
+  [{
+    if (N->isDivergent() || N->getOperand(0)->isDivergent()) return false;
+    return !SetCCFromFP(N->getOperand(0).getNode());    
+  }]
 >;
 
 let AddedComplexity = 20 in {
@@ -2003,9 +2005,6 @@ let AddedComplexity = 20 in {
     (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
   >;
 
-  // TODO: The predicate should not be necessary, but enabling this pattern for
-  // all subtargets generates worse code in some cases.
-  // let OtherPredicates = [HasPseudoScalarTrans] in
   def : GCNPat<
     (f32 (UniformSelect f32:$src0, f32:$src1)),
     (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)

>From dd058cf8c45ee99122cc4909df124ff127742c0a Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Sat, 14 Mar 2026 07:47:47 +0000
Subject: [PATCH 6/6] Add back predicate isWave64 for S_AND etc since it is
 selected for results of V_CMP

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  2 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 45 ++++++++++++++++++-
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8f5ba672a2a0e..48795d61c407a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -349,7 +349,7 @@ bool AMDGPUDAGToDAGISel::SetCCFromFP(const SDNode *N) const {
           Stack.push_back(Node->getOperand(0).getNode());
           Stack.push_back(Node->getOperand(1).getNode());
       } else {
-          assert(false);
+          return true;
       }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 06ce3585819f2..c7f7ad302203b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3052,7 +3052,48 @@ def : GCNPat <
   (S_AND_B32 $src, (i32 1))
 >;
 
-// let WaveSizePredicate = isWave32 in {
+// There are cases where the results of V_CMP are consumed by S_AND etc.
+let WaveSizePredicate = isWave64 in {
+def : GCNPat <
+  (i1 (and i1:$src0, i1:$src1)),
+  (S_AND_B64 $src0, $src1)
+>;
+
+
+def : GCNPat <
+  (i1 (or i1:$src0, i1:$src1)),
+  (S_OR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+  (i1 (xor i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+  (i1 (add i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+  (i1 (sub i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+  (i1 (add i1:$src0, (i1 -1))),
+  (S_NOT_B64 $src0)
+>;
+
+def : GCNPat <
+  (i1 (sub i1:$src0, (i1 -1))),
+  (S_NOT_B64 $src0)
+>;
+}
+} // end isWave64
+
+let WaveSizePredicate = isWave32 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),
   (S_AND_B32 $src0, $src1)
@@ -3089,7 +3130,7 @@ def : GCNPat <
   (S_NOT_B32 $src0)
 >;
 }
-//} // end isWave32
+} // end isWave32
 
 def : GCNPat <
   (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),