[llvm] [AMDGPU] using divergent/uniform information in ISel of zext (PR #174539)

Wed Jan 21 18:12:24 PST 2026

https://github.com/zwu-2025 updated https://github.com/llvm/llvm-project/pull/174539

>From 07e0ee192b7640291fd0d908f1eb0035a4f2b14c Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Sun, 11 Jan 2026 01:11:49 -0600
Subject: [PATCH 1/2] [AMD] using divergent/uniform information in ISel of zext

---
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 12 +++++++++++-
 .../AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll | 19 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ca5a4d7301bda..46a254263c0e4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2535,7 +2535,7 @@ def : GCNPat <
 >;
 
 class Ext32Pat <SDNode ext> : GCNPat <
-  (i32 (ext i1:$src0)),
+  (i32 (ext i1:$src0)), 
   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                      /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
 >;
@@ -3044,6 +3044,11 @@ def : GCNPat <
   (S_AND_B64 $src0, $src1)
 >;
 
+def : GCNPat <
+  (i64 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B64 $src, (i64 1))
+>;
+
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B64 $src0, $src1)
@@ -3083,6 +3088,11 @@ def : GCNPat <
   (S_AND_B32 $src0, $src1)
 >;
 
+def : GCNPat <
+  (i32 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B32 $src, (i32 1))
+>;
+
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B32 $src0, $src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
new file mode 100644
index 0000000000000..63486534e7032
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GCN-OPT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GCN-OPT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GCN-G_SEL %s
+
+; With SelectionDAG (GCN-OPT) the zext of the i1 kernel argument is expected to select s_and_b32.
+
+define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, i1 %pred) #0 {
+entry:
+; GCN-LABEL: zext_i1_to_i32_uniform:
+; GCN-OPT:    s_and_b32 s{{.*}}, s{{.*}}, 1
+; GCN-G_SEL:  v_mov_b32_e32
+; GCN-OPT: s_endpgm
+  %tmp2 = zext i1 %pred to i32
+  store i32 %tmp2, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { nounwind }

>From ee4318bf0edb3b6492c6108a79892cc5870e7e89 Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Wed, 21 Jan 2026 20:11:35 -0600
Subject: [PATCH 2/2] [AMDGPU] using divergent/uniform information in ISel of
 zext

---
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 25 +++++++++++++++++--
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 19 +++++++-------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4ad721bf21959..1bcc231425717 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -105,8 +105,29 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
   MVT VT = Op.getSimpleValueType();
 
   // Stick to the preferred register classes for legal types.
-  if (TLI->isTypeLegal(VT))
-    UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
+  if (TLI->isTypeLegal(VT)) {
+    bool SrcRegIsImplicitDef = false;
+    auto Node = Op.getNode();
+    if (Node->isMachineOpcode() && SrcReg.isPhysical()) {
+      const MCInstrDesc &II = TII->get(Op.getMachineOpcode());
+      auto ImplicitDefs = II.implicit_defs();
+      auto MCSrcReg = SrcReg.asMCReg();
+
+      SrcRegIsImplicitDef = II.NumDefs == 0 && llvm::any_of(ImplicitDefs, [MCSrcReg](MCPhysReg Reg) {
+          return Reg == MCSrcReg;
+      });
+    }
+    if (SrcRegIsImplicitDef) {
+      // When SrcReg is an implicitly defined physical register, this COPY
+      // moves the value out of it into a register with a valid register
+      // class. Such a copy may cross register classes, so prefer
+      // getCrossCopyRegClass over getRegClassFor.
+      const llvm::TargetRegisterClass *ImplicitRC = TRI->getMinimalPhysRegClass(SrcReg);
+      UseRC = TRI->getCrossCopyRegClass(ImplicitRC);
+    }
+    else
+      UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
+  }
 
   for (SDNode *User : Op->users()) {
     bool Match = true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 46a254263c0e4..e165f3bcf2b48 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,16 +3038,22 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
 // instructions resulting in the copies from SCC to these instructions
 // will be moved to the VALU.
 
+def : GCNPat <
+  (i64 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B64 $src, (i64 1))
+>;
+
+def : GCNPat <
+  (i32 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B32 $src, (i32 1))
+>;
+
 let WaveSizePredicate = isWave64 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),
   (S_AND_B64 $src0, $src1)
 >;
 
-def : GCNPat <
-  (i64 (UniformUnaryFrag<zext> i1:$src)),
-  (S_AND_B64 $src, (i64 1))
->;
 
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
@@ -3088,11 +3094,6 @@ def : GCNPat <
   (S_AND_B32 $src0, $src1)
 >;
 
-def : GCNPat <
-  (i32 (UniformUnaryFrag<zext> i1:$src)),
-  (S_AND_B32 $src, (i32 1))
->;
-
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B32 $src0, $src1)