[llvm] [AMDGPU] propagate uniform/divergent in DAG combine (PR #184940)

Zeng Wu via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 5 19:06:05 PST 2026


https://github.com/zwu-2025 created https://github.com/llvm/llvm-project/pull/184940

For the `select` node created in the DAG combiner for `(uint_to_fp (setcc x, y, cc))`, the uniform/divergent information will be updated in the custom combiner.

>From 34dbc2d0a4328bf6b3e4a8343b1de12789b65873 Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Fri, 6 Mar 2026 03:02:55 +0000
Subject: [PATCH] [AMDGPU] propagate uniform/divergent in DAG combine

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  8 ++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  2 +-
 .../AMDGPU/llvm.amdgcn.uniform.combine.ll     | 47 +++++++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.uniform.combine.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 301f2fc8dab45..dc11714f3d519 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16868,6 +16868,14 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
   if (Cond.getOpcode() != ISD::SETCC)
     return SDValue();
 
+  if (DCI.isAfterLegalizeDAG()) {
+      // In the generic combine, (uint_to_fp (setcc x, y, cc)) -> (select (setcc
+      // x, y, cc), 1.0, 0.0)
+      if (N->getValueType(0).isVector()) {
+          DCI.DAG.updateDivergence(N);
+      }
+  }
+
   SDValue LHS = Cond.getOperand(0);
   SDValue RHS = Cond.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..ef09aa4dede3c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2005,7 +2005,7 @@ let AddedComplexity = 20 in {
 
   // TODO: The predicate should not be necessary, but enabling this pattern for
   // all subtargets generates worse code in some cases.
-  let OtherPredicates = [HasPseudoScalarTrans] in
+  // let OtherPredicates = [HasPseudoScalarTrans] in
   def : GCNPat<
     (f32 (UniformSelect f32:$src0, f32:$src1)),
     (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.uniform.combine.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.uniform.combine.ll
new file mode 100644
index 0000000000000..67f3e344a2e38
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.uniform.combine.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=CHECK,GFX950
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefixes=CHECK,GFX1250
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
+
+; CHECK-LABEL: f:
+; GFX950:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950:    s_mov_b64 s[0:1], 0
+; GFX950:    s_load_dword s0, s[0:1], 0x0
+; GFX950:    s_waitcnt lgkmcnt(0)
+; GFX950:    s_cmp_lg_u32 s0, 0
+; GFX950:    s_cselect_b32 s0, 1.0, 0
+; GFX950:    v_mov_b32_e32 v0, s0
+; GFX950:    s_setpc_b64 s[30:31]
+
+; GFX1250:   s_wait_loadcnt_dscnt 0x0
+; GFX1250:   s_wait_kmcnt 0x0
+; GFX1250:   s_mov_b64 s[0:1], 0
+; GFX1250:   s_load_b32 s0, s[0:1], 0x0
+; GFX1250:   s_wait_kmcnt 0x0
+; GFX1250:   s_cmp_lg_u32 s0, 0
+; GFX1250:   s_cselect_b32 s0, 1.0, 0
+; GFX1250:   s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250:   v_mov_b32_e32 v0, s0
+; GFX1250:   s_set_pc_i64 s[30:31]
+
+; DEFAULT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DEFAULT:    s_mov_b64 s[4:5], 0
+; DEFAULT:    s_load_dword s4, s[4:5], 0x0
+; DEFAULT:    s_waitcnt lgkmcnt(0)
+; DEFAULT:    s_cmp_lg_u32 s4, 0
+; DEFAULT:    s_cselect_b32 s4, 1.0, 0
+; DEFAULT:    v_mov_b32_e32 v0, s4
+; DEFAULT:    s_setpc_b64 s[30:31]
+define float @f(i32 %arg, ptr %ptr) {
+bb:
+  %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296
+  %i1 = extractelement <2 x i32> %i, i64 1
+  %i2 = extractelement <2 x i32> %i, i64 0
+  %i3 = lshr i32 %i1, 1
+  %i4 = icmp ne i32 %i2, 0
+  %i8 = uitofp i1 %i4 to float
+  ret float %i8
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; MIR: {{.*}}



More information about the llvm-commits mailing list