[llvm] [AMDGPU] using divergent/uniform information in ISel of zext (PR #174539)

Zeng Wu via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 5 00:10:45 PST 2026


https://github.com/zwu-2025 updated https://github.com/llvm/llvm-project/pull/174539

>From 07e0ee192b7640291fd0d908f1eb0035a4f2b14c Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Sun, 11 Jan 2026 01:11:49 -0600
Subject: [PATCH 1/9] [AMDGPU] using divergent/uniform information in ISel of zext

---
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 12 +++++++++++-
 .../AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll | 19 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ca5a4d7301bda..46a254263c0e4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2535,7 +2535,7 @@ def : GCNPat <
 >;
 
 class Ext32Pat <SDNode ext> : GCNPat <
-  (i32 (ext i1:$src0)),
+  (i32 (ext i1:$src0)), 
   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                      /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
 >;
@@ -3044,6 +3044,11 @@ def : GCNPat <
   (S_AND_B64 $src0, $src1)
 >;
 
+def : GCNPat <
+  (i64 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B64 $src, (i64 1))
+>;
+
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B64 $src0, $src1)
@@ -3083,6 +3088,11 @@ def : GCNPat <
   (S_AND_B32 $src0, $src1)
 >;
 
+def : GCNPat <
+  (i32 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B32 $src, (i32 1))
+>;
+
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B32 $src0, $src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
new file mode 100644
index 0000000000000..63486534e7032
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN-OPT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefix=GCN-OPT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN-G_SEL %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, i1 %pred) #0 {
+entry:
+; GCN-LABEL: zext_i1_to_i32_uniform:
+; GCN-OPT:    s_and_b32 s{{.*}}, s{{.*}}, 1
+; GCN-G_SEL:  v_mov_b32_e32
+; GCN-OPT: s_endpgm
+  %tmp2 = zext i1 %pred to i32
+  store i32 %tmp2, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { nounwind }

>From 29e1625a531b855192f4d9c96300e5dcd54287f7 Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Wed, 21 Jan 2026 20:11:35 -0600
Subject: [PATCH 2/9] [AMDGPU] using divergent/uniform information in ISel of
 zext

---
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 33 ++++++++++++++++---
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 14 +++-----
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4ad721bf21959..9eb8df71ed165 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/PseudoProbe.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
@@ -105,8 +106,29 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
   MVT VT = Op.getSimpleValueType();
 
   // Stick to the preferred register classes for legal types.
-  if (TLI->isTypeLegal(VT))
-    UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
+  if (TLI->isTypeLegal(VT)) {
+      const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
+      UseRC = TRI->getCrossCopyRegClass(SrcRegRC);
+      const llvm::TargetRegisterClass *LegalRC =
+          TLI->getRegClassFor(VT, Op->isDivergent());
+
+      if (!TRI->isTypeLegalForClass(*UseRC, VT)) {
+          UseRC = LegalRC;
+      }
+      {
+        // If there is a sub class relation between CrossCopyRegClass and
+        // natively supported RegClass, the result of getRegClassFor, then
+        // we use natively supported RegClass to stick the existing logic.
+        // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
+        // and x0 is natively supported in regclass `GPR64all`, then `GPR64all` is chosen.
+        // However, on AMDGPU, for `scc`, the natively supported regclass is, for
+        // some reasons, SGPR_64 but CrossCopyRegClass is SGPR_32. Since there
+        // is subclass relation, CrossCopyRegClass, SGPR_32 is picked.
+        if (TRI->getCommonSubClass(UseRC, LegalRC)) {
+            UseRC = LegalRC;
+        }
+      }
+  }
 
   for (SDNode *User : Op->users()) {
     bool Match = true;
@@ -121,7 +143,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
       for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
         if (User->getOperand(i) != Op)
           continue;
-        if (VT == MVT::Other || VT == MVT::Glue)
+        if (VT == MVT::Other || VT == MVT::Glue || !TLI->isTypeLegal(VT))
           continue;
         Match = false;
         if (User->isMachineOpcode()) {
@@ -131,10 +153,11 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
             RC = TRI->getAllocatableClass(
                 TII->getRegClass(II, i + II.getNumDefs()));
           }
+
           if (!UseRC)
             UseRC = RC;
           else if (RC) {
-            const TargetRegisterClass *ComRC =
+              const TargetRegisterClass *ComRC =
                 TRI->getCommonSubClass(UseRC, RC);
             // If multiple uses expect disjoint register classes, we emit
             // copies in AddRegisterOperand.
@@ -153,6 +176,8 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
 
   // Figure out the register class to create for the destreg.
+  // If SrcReg is phsysical register, the corresponding register class could be
+  // non allocable, so we prefer UseRC to SrcRC
   if (VRBase) {
     DstRC = MRI->getRegClass(VRBase);
   } else if (UseRC) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 46a254263c0e4..257fa23608d94 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,16 +3038,17 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
 // instructions resulting in the copies from SCC to these instructions
 // will be moved to the VALU.
 
+def : GCNPat <
+ (i32 (UniformUnaryFrag<zext> i1:$src)),
+  (S_AND_B32 $src, (i32 1))
+>;
+
 let WaveSizePredicate = isWave64 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),
   (S_AND_B64 $src0, $src1)
 >;
 
-def : GCNPat <
-  (i64 (UniformUnaryFrag<zext> i1:$src)),
-  (S_AND_B64 $src, (i64 1))
->;
 
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
@@ -3088,11 +3089,6 @@ def : GCNPat <
   (S_AND_B32 $src0, $src1)
 >;
 
-def : GCNPat <
-  (i32 (UniformUnaryFrag<zext> i1:$src)),
-  (S_AND_B32 $src, (i32 1))
->;
-
 def : GCNPat <
   (i1 (or i1:$src0, i1:$src1)),
   (S_OR_B32 $src0, $src1)

>From d165a018d080589e98b4a34a95e656856c6a4734 Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Fri, 30 Jan 2026 18:16:49 +0000
Subject: [PATCH 3/9] comments

---
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 43 +++++++++----------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 9eb8df71ed165..2e661faafbb95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -107,27 +107,24 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
 
   // Stick to the preferred register classes for legal types.
   if (TLI->isTypeLegal(VT)) {
-      const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
-      UseRC = TRI->getCrossCopyRegClass(SrcRegRC);
-      const llvm::TargetRegisterClass *LegalRC =
+    const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
+    const llvm::TargetRegisterClass *CrossCopyRC = TRI->getCrossCopyRegClass(SrcRegRC);
+    const llvm::TargetRegisterClass *LegalRC =
           TLI->getRegClassFor(VT, Op->isDivergent());
 
-      if (!TRI->isTypeLegalForClass(*UseRC, VT)) {
-          UseRC = LegalRC;
-      }
-      {
-        // If there is a sub class relation between CrossCopyRegClass and
-        // natively supported RegClass, the result of getRegClassFor, then
-        // we use natively supported RegClass to stick the existing logic.
-        // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
-        // and x0 is natively supported in regclass `GPR64all`, then `GPR64all` is chosen.
-        // However, on AMDGPU, for `scc`, the natively supported regclass is, for
-        // some reasons, SGPR_64 but CrossCopyRegClass is SGPR_32. Since there
-        // is subclass relation, CrossCopyRegClass, SGPR_32 is picked.
-        if (TRI->getCommonSubClass(UseRC, LegalRC)) {
-            UseRC = LegalRC;
-        }
-      }
+    // If there is a sub class relation between CrossCopyRegClass and
+    // natively supported RegClass, the result of getRegClassFor, then
+    // we use natively supported RegClass to stick the existing logic.
+    // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
+    // and x0 is natively supported in regclass `GPR64all`, then `GPR64all` is chosen.
+    // However, on AMDGPU, for `scc`, the natively supported regclass is, for
+    // some reasons, SGPR_64 but CrossCopyRegClass is SGPR_32. Since there
+    // is subclass relation, CrossCopyRegClass, SGPR_32 is picked.
+    if (!TRI->isTypeLegalForClass(*CrossCopyRC, VT) || TRI->getCommonSubClass(CrossCopyRC, LegalRC)) {
+      UseRC = LegalRC;
+    } else {
+      UseRC = CrossCopyRC;
+    }
   }
 
   for (SDNode *User : Op->users()) {
@@ -143,7 +140,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
       for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
         if (User->getOperand(i) != Op)
           continue;
-        if (VT == MVT::Other || VT == MVT::Glue || !TLI->isTypeLegal(VT))
+        if (VT == MVT::Other || VT == MVT::Glue)
           continue;
         Match = false;
         if (User->isMachineOpcode()) {
@@ -157,7 +154,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
           if (!UseRC)
             UseRC = RC;
           else if (RC) {
-              const TargetRegisterClass *ComRC =
+            const TargetRegisterClass *ComRC =
                 TRI->getCommonSubClass(UseRC, RC);
             // If multiple uses expect disjoint register classes, we emit
             // copies in AddRegisterOperand.
@@ -174,10 +171,10 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
 
   const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
+  // if (!SrcRC->isAllocatable())
+  //  SrcRC = TRI->getCrossCopyRegClass(SrcRC);
 
   // Figure out the register class to create for the destreg.
-  // If SrcReg is phsysical register, the corresponding register class could be
-  // non allocable, so we prefer UseRC to SrcRC
   if (VRBase) {
     DstRC = MRI->getRegClass(VRBase);
   } else if (UseRC) {

>From 74fe3bee4ccc3f360fb0b4e89b6184ba952e46ee Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Fri, 30 Jan 2026 12:19:23 -0600
Subject: [PATCH 4/9] comments

---
 llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 2e661faafbb95..2300693aa766a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -171,8 +171,6 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
 
   const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
-  // if (!SrcRC->isAllocatable())
-  //  SrcRC = TRI->getCrossCopyRegClass(SrcRC);
 
   // Figure out the register class to create for the destreg.
   if (VRBase) {

>From 529e931143253345bd32cb871313997318f466ed Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Wed, 4 Feb 2026 08:36:36 +0000
Subject: [PATCH 5/9] [AMDGPU] update SCCCopies in si-fix-sgpr-copies

---
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |  58 +++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll     | 122 ++++++++++++++++++
 2 files changed, 171 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll

diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 39a6a7762eea5..68a348cda39b9 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,6 +69,7 @@
 #include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Target/TargetMachine.h"
@@ -1183,15 +1184,54 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
       Register SrcReg = MI.getOperand(1).getReg();
       Register DstReg = MI.getOperand(0).getReg();
       if (SrcReg == AMDGPU::SCC) {
-        Register SCCCopy =
-            MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
-        I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
-                    MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
-                .addImm(-1)
-                .addImm(0);
-        I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
-                    TII->get(AMDGPU::COPY), DstReg)
-                .addReg(SCCCopy);
+        Register SCCCopy = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0_XEXECRegClass);
+        I = BuildMI(
+                  *MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
+                  MI.getDebugLoc(), TII->get(AMDGPU::S_CSELECT_B32), SCCCopy)
+                  .addImm(-1)
+                  .addImm(0);
+
+        assert(DstReg.isVirtual());
+
+        const llvm::TargetRegisterInfo *TRI =
+            MF.getSubtarget().getRegisterInfo();
+
+        for (llvm::MachineOperand &UseOp : MRI->use_operands(DstReg)) {
+          llvm::MachineInstr *UserMI = UseOp.getParent();
+          for (const llvm::MachineOperand &Output : UserMI->defs()) {
+            if (!Output.isReg())
+              continue;
+           Register OutputReg = Output.getReg();
+           const auto RegSize = TRI->getRegSizeInBits(OutputReg, *MRI);
+           if (RegSize == 32) {
+              I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
+                            TII->get(AMDGPU::COPY), DstReg)
+                    .addReg(SCCCopy);
+            } else {
+              assert(RegSize == 64);
+
+              if (UserMI->getOpcode() != AMDGPU::COPY) {
+                // After DAG-2-DAG selection, e.g.
+                // %12:sreg_32 = COPY $scc
+                // %14:sreg_64_xexec = COPY %12:sreg_32
+                // ....
+                // so if opcode is not COPY, the legalization in ISel will make
+                // sure the the copy is legal. Only the COPY inserted in DAG to Block
+                // could have this issue.
+                  continue;
+              }
+
+              BuildMI(MBB, UserMI, UserMI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+                      OutputReg)
+                  .addReg(SCCCopy)
+                  .addImm(AMDGPU::sub0)
+                  .addReg(SCCCopy)
+                  .addImm(AMDGPU::sub1);
+              UserMI->eraseFromParent();
+            }
+          }
+        }
+
         MI.eraseFromParent();
         continue;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
new file mode 100644
index 0000000000000..6404afa911271
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX950 %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+; GCN-LABEL: zext_i1_to_i32_uniform
+; GFX950:   s_load_dword s2, s[4:5], 0x34
+; GFX950:   s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950:   v_mov_b32_e32 v0, 0
+; GFX950:   s_waitcnt lgkmcnt(0)
+; GFX950:   s_cmpk_eq_i32 s2, 0x171
+; GFX950:   s_cselect_b32 s2, -1, 0
+; GFX950:   s_and_b32 s2, s2, 1
+; GFX950:   v_mov_b32_e32 v1, s2
+; GFX950:   global_store_dword v0, v1, s[0:1]
+; GFX1250:  s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250:  s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250:  s_wait_kmcnt 0x0
+; GFX1250:  s_cmp_eq_u32 s2, 0x171
+; GFX1250:  s_cselect_b32 s2, -1, 0
+; GFX1250:  s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250:  s_and_b32 s2, s2, 1
+; GFX1250:  v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250:  global_store_b32 v0, v1, s[0:1]
+define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+  %pred = icmp eq i32 %v1, 369
+  %tmp2 = zext i1 %pred to i32
+  
+  store i32 %tmp2, ptr addrspace(1) %out64
+  ret void
+}
+
+; GCN-LABEL: zext_i1_to_i64_uniform
+; GFX950:     s_load_dword s2, s[4:5], 0x34
+; GFX950:     s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950:     s_mov_b32 s4, 0
+; GFX950:     v_mov_b32_e32 v2, 0
+; GFX950:     v_mov_b32_e32 v1, s4
+; GFX950:     s_waitcnt lgkmcnt(0)
+; GFX950:     s_cmpk_eq_i32 s2, 0x171
+; GFX950:     s_cselect_b32 s2, -1, 0
+; GFX950:     s_mov_b32 s3, s2
+; GFX950:     v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX950:     global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1250:   s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250:   s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250:   s_wait_kmcnt 0x0
+; GFX1250:   s_cmp_eq_u32 s2, 0x171
+; GFX1250:   s_mov_b32 s2, 0
+; GFX1250:   s_cselect_b32 s3, -1, 0
+; GFX1250:   v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250:   v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1250:   global_store_b64 v2, v[0:1], s[0:1]
+define amdgpu_kernel void @zext_i1_to_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+  %pred = icmp eq i32 %v1, 369
+  %tmp2 = zext i1 %pred to i64
+  
+  store i64 %tmp2, ptr addrspace(1) %out64
+  ret void
+}
+
+; GCN-LABEL: zext_i1_to_i32_ext64_uniform
+; GFX950:   s_load_dword s2, s[4:5], 0x34
+; GFX950:   s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950:   s_mov_b32 s4, 0
+; GFX950:   v_mov_b32_e32 v2, 0
+; GFX950:   v_mov_b32_e32 v1, s4
+; GFX950:   s_waitcnt lgkmcnt(0)
+; GFX950:   s_cmpk_eq_i32 s2, 0x171
+; GFX950:   s_cselect_b32 s2, -1, 0
+; GFX950:   s_mov_b32 s3, s2
+; GFX950:   v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX950:   global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1250:   s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250:   s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250:   s_wait_kmcnt 0x0
+; GFX1250:   s_cmp_eq_u32 s2, 0x171
+; GFX1250:   s_mov_b32 s2, 0
+; GFX1250:   s_cselect_b32 s3, -1, 0
+; GFX1250:   v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250:   v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1250:   global_store_b64 v2, v[0:1], s[0:1]
+define amdgpu_kernel void @zext_i1_to_i32_ext64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+  %pred = icmp eq i32 %v1, 369
+  %tmp2 = zext i1 %pred to i32
+  %tmp3 = zext i32 %tmp2 to i64
+  store i64 %tmp3, ptr addrspace(1) %out64
+  ret void
+}
+
+; GCN-LABEL: zext_i1_to_i64_trunc32_uniform
+; GFX950:   s_load_dword s2, s[4:5], 0x34
+; GFX950:   s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950:   v_mov_b32_e32 v0, 0
+; GFX950:   s_waitcnt lgkmcnt(0)
+; GFX950:   s_cmpk_eq_i32 s2, 0x171
+; GFX950:   s_cselect_b32 s2, -1, 0
+; GFX950:   s_and_b32 s2, s2, 1
+; GFX950:   v_mov_b32_e32 v1, s2
+; GFX1250:   s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250:   s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250:   s_wait_kmcnt 0x0
+; GFX1250:   s_cmp_eq_u32 s2, 0x171
+; GFX1250:   s_cselect_b32 s2, -1, 0
+; GFX1250:   s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250:   s_and_b32 s2, s2, 1
+; GFX1250:   v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250:   global_store_b32 v0, v1, s[0:1]
+define amdgpu_kernel void @zext_i1_to_i64_trunc32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+  %pred = icmp eq i32 %v1, 369
+  %tmp2 = zext i1 %pred to i64
+  %tmp3 = trunc i64 %tmp2 to i32
+  store i32 %tmp3, ptr addrspace(1) %out64
+  ret void
+}
+
+attributes #0 = { nounwind }

>From a24d6e29f5594c9e1089db0e394040666ec90efc Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Sun, 22 Feb 2026 15:16:57 +0000
Subject: [PATCH 6/9] change the decision order of UseRC

---
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 28 ++++++-------------
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |  2 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  5 ++--
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 10 +++++++
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 2300693aa766a..2e247456f5c2e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -106,26 +106,6 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
   MVT VT = Op.getSimpleValueType();
 
   // Stick to the preferred register classes for legal types.
-  if (TLI->isTypeLegal(VT)) {
-    const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
-    const llvm::TargetRegisterClass *CrossCopyRC = TRI->getCrossCopyRegClass(SrcRegRC);
-    const llvm::TargetRegisterClass *LegalRC =
-          TLI->getRegClassFor(VT, Op->isDivergent());
-
-    // If there is a sub class relation between CrossCopyRegClass and
-    // natively supported RegClass, the result of getRegClassFor, then
-    // we use natively supported RegClass to stick the existing logic.
-    // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
-    // and x0 is natively supported in regclass `GPR64all`, then `GPR64all` is chosen.
-    // However, on AMDGPU, for `scc`, the natively supported regclass is, for
-    // some reasons, SGPR_64 but CrossCopyRegClass is SGPR_32. Since there
-    // is subclass relation, CrossCopyRegClass, SGPR_32 is picked.
-    if (!TRI->isTypeLegalForClass(*CrossCopyRC, VT) || TRI->getCommonSubClass(CrossCopyRC, LegalRC)) {
-      UseRC = LegalRC;
-    } else {
-      UseRC = CrossCopyRC;
-    }
-  }
 
   for (SDNode *User : Op->users()) {
     bool Match = true;
@@ -169,6 +149,14 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
       break;
   }
 
+
+  if (UseRC == nullptr || !UseRC->isAllocatable()) {
+    // The check is to be removed in other pending PR, it is kept to make System Z happy.
+    if (TLI->isTypeLegal(VT)) {
+      UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
+    }
+  }
+
   const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
 
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 68a348cda39b9..edf6a2872d6f5 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -783,7 +783,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
 
   lowerVGPR2SGPRCopies(MF);
   // Postprocessing
-  fixSCCCopies(MF);
+  // fixSCCCopies(MF);
   for (auto *MI : S2VCopies) {
     // Check if it is still valid
     if (MI->isCopy()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 301f2fc8dab45..7ebd14bfe5c75 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18852,8 +18852,9 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
-    return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
-                                 : &AMDGPU::SReg_32RegClass;
+      return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
+          : &AMDGPU::SReg_32RegClass;
+
   if (!TRI->isSGPRClass(RC) && !isDivergent)
     return TRI->getEquivalentSGPRClass(RC);
   if (TRI->isSGPRClass(RC) && isDivergent) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 257fa23608d94..be1af6cda43d7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3043,6 +3043,16 @@ def : GCNPat <
   (S_AND_B32 $src, (i32 1))
 >;
 
+// Since the operand of S_AND_B64 is SReg_64, which does not support i1,
+// build the 64-bit result from a 32-bit AND via REG_SEQUENCE.
+let AddedComplexity = 10 in
+def : GCNPat <
+  (i64 (UniformUnaryFrag<zext> i1:$src)),
+  (REG_SEQUENCE SReg_64,
+    (S_MOV_B32 (S_AND_B32 $src, (i32 1))), sub0,
+    (S_MOV_B32 (i32 0)), sub1
+  )
+>;
+
 let WaveSizePredicate = isWave64 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),

>From 421971dee9052ad4df4a34194e92534156c81af2 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Mon, 2 Mar 2026 03:24:17 +0000
Subject: [PATCH 7/9] use common-sub-regclass

---
 llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 2e247456f5c2e..3096ab5ccb698 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/PseudoProbe.h"
@@ -149,12 +150,16 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
       break;
   }
 
+  const TargetRegisterClass *RegClassForVT = nullptr;
+  // The check is to be removed in other pending PR, it is kept to make System Z happy.
+  if (TLI->isTypeLegal(VT)) {
+      RegClassForVT = TLI->getRegClassFor(VT, Op->isDivergent());
+  }
 
   if (UseRC == nullptr || !UseRC->isAllocatable()) {
-    // The check is to be removed in other pending PR, it is kept to make System Z happy.
-    if (TLI->isTypeLegal(VT)) {
-      UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
-    }
+      UseRC = RegClassForVT;
+  } else if (auto CommonSubClass = TRI->getCommonSubClass(UseRC, RegClassForVT)) {
+      UseRC = CommonSubClass;
   }
 
   const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;

>From cdc8e8023d68638ef67afe1ac21f04da6a93afb3 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Tue, 3 Mar 2026 06:19:27 +0000
Subject: [PATCH 8/9] remove S_AND, S_OR etc under predicate isWave64

---
 llvm/lib/Target/AMDGPU/SIInstructions.td  | 44 ++---------------------
 llvm/lib/Target/AMDGPU/SOPInstructions.td | 11 +++++-
 2 files changed, 12 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index be1af6cda43d7..0bd3de21e44a8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3053,47 +3053,7 @@ def : GCNPat <
   )
 >;
 
-let WaveSizePredicate = isWave64 in {
-def : GCNPat <
-  (i1 (and i1:$src0, i1:$src1)),
-  (S_AND_B64 $src0, $src1)
->;
-
-
-def : GCNPat <
-  (i1 (or i1:$src0, i1:$src1)),
-  (S_OR_B64 $src0, $src1)
->;
-
-def : GCNPat <
-  (i1 (xor i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
-  (i1 (add i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
-  (i1 (sub i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
->;
-
-let AddedComplexity = 1 in {
-def : GCNPat <
-  (i1 (add i1:$src0, (i1 -1))),
-  (S_NOT_B64 $src0)
->;
-
-def : GCNPat <
-  (i1 (sub i1:$src0, (i1 -1))),
-  (S_NOT_B64 $src0)
->;
-}
-} // end isWave64
-
-let WaveSizePredicate = isWave32 in {
+// let WaveSizePredicate = isWave32 in {
 def : GCNPat <
   (i1 (and i1:$src0, i1:$src1)),
   (S_AND_B32 $src0, $src1)
@@ -3130,7 +3090,7 @@ def : GCNPat <
   (S_NOT_B32 $src0)
 >;
 }
-} // end isWave32
+// } // end isWave32
 
 def : GCNPat <
   (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..b95dae2e8a712 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1997,9 +1997,18 @@ def UniformSelect : PatFrag<
   [{ return !N->isDivergent(); }]
 >;
 
+def FreezeUniformSelect : PatFrag<
+  (ops node:$src0, node:$src1),
+  (select (freeze SCC), node:$src0, node:$src1),
+  [{
+    SDValue FreezeNode = N->getOperand(0);
+    return FreezeNode->hasOneUse() && !N->isDivergent();
+  }]
+>;
+
 let AddedComplexity = 20 in {
   def : GCNPat<
-    (i32 (UniformSelect i32:$src0, i32:$src1)),
+    (i32 (FreezeUniformSelect i32:$src0, i32:$src1)),
     (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
   >;
 

>From 55e1ae6019080b00d069ee1f3e6b6199c02b5e35 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Tue, 3 Mar 2026 06:19:27 +0000
Subject: [PATCH 9/9] Hack On fixSCCCopies

Revert "[AMDGPU] update SCCCopies in si-fix-sgpr-copies"

This reverts commit 529e931143253345bd32cb871313997318f466ed.
---
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |  87 ++++++-------
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   1 +
 .../CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll     | 122 ------------------
 3 files changed, 41 insertions(+), 169 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll

diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index edf6a2872d6f5..605217b5d0095 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,8 +69,8 @@
 #include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -783,7 +783,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
 
   lowerVGPR2SGPRCopies(MF);
   // Postprocessing
-  // fixSCCCopies(MF);
+  fixSCCCopies(MF);
   for (auto *MI : S2VCopies) {
     // Check if it is still valid
     if (MI->isCopy()) {
@@ -1174,64 +1174,35 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
 void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
   const AMDGPU::LaneMaskConstants &LMC =
       AMDGPU::LaneMaskConstants::get(MF.getSubtarget<GCNSubtarget>());
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
   for (MachineBasicBlock &MBB : MF) {
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
-         ++I) {
-      MachineInstr &MI = *I;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+
+      MachineInstr &MI = *I++;
       // May already have been lowered.
       if (!MI.isCopy())
         continue;
       Register SrcReg = MI.getOperand(1).getReg();
       Register DstReg = MI.getOperand(0).getReg();
       if (SrcReg == AMDGPU::SCC) {
-        Register SCCCopy = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0_XEXECRegClass);
-        I = BuildMI(
+        assert(DstReg.isVirtual());
+        Register NewDstReg = MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
+        auto NewCopy = BuildMI(
                   *MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
-                  MI.getDebugLoc(), TII->get(AMDGPU::S_CSELECT_B32), SCCCopy)
+                  MI.getDebugLoc(), TII->get(LMC.CSelectOpc), NewDstReg)
                   .addImm(-1)
                   .addImm(0);
 
-        assert(DstReg.isVirtual());
-
-        const llvm::TargetRegisterInfo *TRI =
-            MF.getSubtarget().getRegisterInfo();
-
-        for (llvm::MachineOperand &UseOp : MRI->use_operands(DstReg)) {
-          llvm::MachineInstr *UserMI = UseOp.getParent();
-          for (const llvm::MachineOperand &Output : UserMI->defs()) {
-            if (!Output.isReg())
-              continue;
-           Register OutputReg = Output.getReg();
-           const auto RegSize = TRI->getRegSizeInBits(OutputReg, *MRI);
-           if (RegSize == 32) {
-              I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
-                            TII->get(AMDGPU::COPY), DstReg)
-                    .addReg(SCCCopy);
-            } else {
-              assert(RegSize == 64);
-
-              if (UserMI->getOpcode() != AMDGPU::COPY) {
-                // After DAG-2-DAG selection, e.g.
-                // %12:sreg_32 = COPY $scc
-                // %14:sreg_64_xexec = COPY %12:sreg_32
-                // ....
-                // so if opcode is not COPY, the legalization in ISel will make
-                // sure the the copy is legal. Only the COPY inserted in DAG to Block
-                // could have this issue.
-                  continue;
-              }
+        for (MachineOperand &UseMO : llvm::make_early_inc_range(MRI->use_operands(DstReg))) {
+            // Do not rewrite operands belonging to the S_CSELECT we just created.
+            if (UseMO.getParent() == NewCopy)
+                continue;
 
-              BuildMI(MBB, UserMI, UserMI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
-                      OutputReg)
-                  .addReg(SCCCopy)
-                  .addImm(AMDGPU::sub0)
-                  .addReg(SCCCopy)
-                  .addImm(AMDGPU::sub1);
-              UserMI->eraseFromParent();
-            }
-          }
+            UseMO.setReg(NewDstReg);
         }
-
         MI.eraseFromParent();
         continue;
       }
@@ -1242,8 +1213,30 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
                 .addReg(Tmp, getDefRegState(true))
                 .addReg(SrcReg)
                 .addReg(LMC.ExecReg);
+
         MI.eraseFromParent();
       }
+
+      if (SrcReg.isPhysical() || DstReg.isPhysical()) continue;
+      auto SrcRC = MRI->getRegClass(SrcReg);
+      auto DstRC = MRI->getRegClass(DstReg);
+
+      if (TRI->isSGPRClass(DstRC) && TRI->isSGPRClass(SrcRC) && DstRC != SrcRC) {
+        auto DstBitWidth = TRI->getRegSizeInBits(DstReg, *MRI);
+        auto SrcBitWidth = TRI->getRegSizeInBits(SrcReg, *MRI);
+
+        if (DstBitWidth == 64 && SrcBitWidth == 32) {
+          BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+                    DstReg)
+                .addReg(SrcReg)
+                .addImm(AMDGPU::sub0)
+                .addReg(SrcReg)
+                .addImm(AMDGPU::sub1);
+           MI.eraseFromParent();
+        } else if (DstBitWidth == 32 && SrcBitWidth == 64) {
+            MI.getOperand(1).setSubReg(AMDGPU::sub0);
+        }
+      }
     }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0bd3de21e44a8..3b9c88976929c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2452,6 +2452,7 @@ def : GCNPat <
   (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+
 def : GCNPat <
   (bf16 fpimm:$imm),
   (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
deleted file mode 100644
index 6404afa911271..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX950 %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-; GCN-LABEL: zext_i1_to_i32_uniform
-; GFX950:   s_load_dword s2, s[4:5], 0x34
-; GFX950:   s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950:   v_mov_b32_e32 v0, 0
-; GFX950:   s_waitcnt lgkmcnt(0)
-; GFX950:   s_cmpk_eq_i32 s2, 0x171
-; GFX950:   s_cselect_b32 s2, -1, 0
-; GFX950:   s_and_b32 s2, s2, 1
-; GFX950:   v_mov_b32_e32 v1, s2
-; GFX950:   global_store_dword v0, v1, s[0:1]
-; GFX1250:  s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250:  s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250:  s_wait_kmcnt 0x0
-; GFX1250:  s_cmp_eq_u32 s2, 0x171
-; GFX1250:  s_cselect_b32 s2, -1, 0
-; GFX1250:  s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250:  s_and_b32 s2, s2, 1
-; GFX1250:  v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250:  global_store_b32 v0, v1, s[0:1]
-define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
-  %pred = icmp eq i32 %v1, 369
-  %tmp2 = zext i1 %pred to i32
-  
-  store i32 %tmp2, ptr addrspace(1) %out64
-  ret void
-}
-
-; GCN-LABEL: zext_i1_to_i64_uniform
-; GFX950:     s_load_dword s2, s[4:5], 0x34
-; GFX950:     s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950:     s_mov_b32 s4, 0
-; GFX950:     v_mov_b32_e32 v2, 0
-; GFX950:     v_mov_b32_e32 v1, s4
-; GFX950:     s_waitcnt lgkmcnt(0)
-; GFX950:     s_cmpk_eq_i32 s2, 0x171
-; GFX950:     s_cselect_b32 s2, -1, 0
-; GFX950:     s_mov_b32 s3, s2
-; GFX950:     v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX950:     global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1250:   s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250:   s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250:   s_wait_kmcnt 0x0
-; GFX1250:   s_cmp_eq_u32 s2, 0x171
-; GFX1250:   s_mov_b32 s2, 0
-; GFX1250:   s_cselect_b32 s3, -1, 0
-; GFX1250:   v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250:   v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX1250:   global_store_b64 v2, v[0:1], s[0:1]
-define amdgpu_kernel void @zext_i1_to_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
-  %pred = icmp eq i32 %v1, 369
-  %tmp2 = zext i1 %pred to i64
-  
-  store i64 %tmp2, ptr addrspace(1) %out64
-  ret void
-}
-
-; GCN-LABEL: zext_i1_to_i32_ext64_uniform
-; GFX950:   s_load_dword s2, s[4:5], 0x34
-; GFX950:   s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950:   s_mov_b32 s4, 0
-; GFX950:   v_mov_b32_e32 v2, 0
-; GFX950:   v_mov_b32_e32 v1, s4
-; GFX950:   s_waitcnt lgkmcnt(0)
-; GFX950:   s_cmpk_eq_i32 s2, 0x171
-; GFX950:   s_cselect_b32 s2, -1, 0
-; GFX950:   s_mov_b32 s3, s2
-; GFX950:   v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX950:   global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1250:   s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250:   s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250:   s_wait_kmcnt 0x0
-; GFX1250:   s_cmp_eq_u32 s2, 0x171
-; GFX1250:   s_mov_b32 s2, 0
-; GFX1250:   s_cselect_b32 s3, -1, 0
-; GFX1250:   v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250:   v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX1250:   global_store_b64 v2, v[0:1], s[0:1]
-define amdgpu_kernel void @zext_i1_to_i32_ext64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
-  %pred = icmp eq i32 %v1, 369
-  %tmp2 = zext i1 %pred to i32
-  %tmp3 = zext i32 %tmp2 to i64
-  store i64 %tmp3, ptr addrspace(1) %out64
-  ret void
-}
-
-; GCN-LABEL: zext_i1_to_i64_trunc32_uniform
-; GFX950:   s_load_dword s2, s[4:5], 0x34
-; GFX950:   s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950:   v_mov_b32_e32 v0, 0
-; GFX950:   s_waitcnt lgkmcnt(0)
-; GFX950:   s_cmpk_eq_i32 s2, 0x171
-; GFX950:   s_cselect_b32 s2, -1, 0
-; GFX950:   s_and_b32 s2, s2, 1
-; GFX950:   v_mov_b32_e32 v1, s2
-; GFX1250:   s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250:   s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250:   s_wait_kmcnt 0x0
-; GFX1250:   s_cmp_eq_u32 s2, 0x171
-; GFX1250:   s_cselect_b32 s2, -1, 0
-; GFX1250:   s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250:   s_and_b32 s2, s2, 1
-; GFX1250:   v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250:   global_store_b32 v0, v1, s[0:1]
-define amdgpu_kernel void @zext_i1_to_i64_trunc32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
-  %pred = icmp eq i32 %v1, 369
-  %tmp2 = zext i1 %pred to i64
-  %tmp3 = trunc i64 %tmp2 to i32
-  store i32 %tmp3, ptr addrspace(1) %out64
-  ret void
-}
-
-attributes #0 = { nounwind }



More information about the llvm-commits mailing list