[llvm] [AMDGPU] using divergent/uniform information in ISel of zext (PR #174539)
Zeng Wu via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 00:10:45 PST 2026
https://github.com/zwu-2025 updated https://github.com/llvm/llvm-project/pull/174539
>From 07e0ee192b7640291fd0d908f1eb0035a4f2b14c Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Sun, 11 Jan 2026 01:11:49 -0600
Subject: [PATCH 1/9] [AMDGPU] using divergent/uniform information in ISel of zext
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 +++++++++++-
.../AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll | 19 +++++++++++++++++++
2 files changed, 30 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ca5a4d7301bda..46a254263c0e4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2535,7 +2535,7 @@ def : GCNPat <
>;
class Ext32Pat <SDNode ext> : GCNPat <
- (i32 (ext i1:$src0)),
+ (i32 (ext i1:$src0)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
>;
@@ -3044,6 +3044,11 @@ def : GCNPat <
(S_AND_B64 $src0, $src1)
>;
+def : GCNPat <
+ (i64 (UniformUnaryFrag<zext> i1:$src)),
+ (S_AND_B64 $src, (i64 1))
+>;
+
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
@@ -3083,6 +3088,11 @@ def : GCNPat <
(S_AND_B32 $src0, $src1)
>;
+def : GCNPat <
+ (i32 (UniformUnaryFrag<zext> i1:$src)),
+ (S_AND_B32 $src, (i32 1))
+>;
+
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B32 $src0, $src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
new file mode 100644
index 0000000000000..63486534e7032
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.i1.i32.uniform.zext.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN-OPT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefix=GCN-OPT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN-G_SEL %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, i1 %pred) #0 {
+entry:
+; GCN-LABEL: zext_i1_to_i32_uniform:
+; GCN-OPT: s_and_b32 s{{.*}}, s{{.*}}, 1
+; GCN-G_SEL: v_mov_b32_e32
+; GCN-OPT: s_endpgm
+ %tmp2 = zext i1 %pred to i32
+ store i32 %tmp2, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { nounwind }
>From 29e1625a531b855192f4d9c96300e5dcd54287f7 Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Wed, 21 Jan 2026 20:11:35 -0600
Subject: [PATCH 2/9] [AMDGPU] using divergent/uniform information in ISel of
zext
---
.../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 33 ++++++++++++++++---
llvm/lib/Target/AMDGPU/SIInstructions.td | 14 +++-----
2 files changed, 34 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4ad721bf21959..9eb8df71ed165 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/PseudoProbe.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -105,8 +106,29 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
MVT VT = Op.getSimpleValueType();
// Stick to the preferred register classes for legal types.
- if (TLI->isTypeLegal(VT))
- UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
+ if (TLI->isTypeLegal(VT)) {
+ const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
+ UseRC = TRI->getCrossCopyRegClass(SrcRegRC);
+ const llvm::TargetRegisterClass *LegalRC =
+ TLI->getRegClassFor(VT, Op->isDivergent());
+
+ if (!TRI->isTypeLegalForClass(*UseRC, VT)) {
+ UseRC = LegalRC;
+ }
+ {
+ // If there is a subclass relation between the CrossCopyRegClass and the
+ // natively supported RegClass (the result of getRegClassFor), then
+ // we use the natively supported RegClass to stick to the existing logic.
+ // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
+ // and x0 is natively supported in regclass `GPR64all`, so `GPR64all` is chosen.
+ // However, on AMDGPU, for `scc`, the natively supported regclass is, for
+ // some reason, SGPR_64 while the CrossCopyRegClass is SGPR_32. Since there
+ // is a subclass relation, the CrossCopyRegClass, SGPR_32, is picked.
+ if (TRI->getCommonSubClass(UseRC, LegalRC)) {
+ UseRC = LegalRC;
+ }
+ }
+ }
for (SDNode *User : Op->users()) {
bool Match = true;
@@ -121,7 +143,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
if (User->getOperand(i) != Op)
continue;
- if (VT == MVT::Other || VT == MVT::Glue)
+ if (VT == MVT::Other || VT == MVT::Glue || !TLI->isTypeLegal(VT))
continue;
Match = false;
if (User->isMachineOpcode()) {
@@ -131,10 +153,11 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
RC = TRI->getAllocatableClass(
TII->getRegClass(II, i + II.getNumDefs()));
}
+
if (!UseRC)
UseRC = RC;
else if (RC) {
- const TargetRegisterClass *ComRC =
+ const TargetRegisterClass *ComRC =
TRI->getCommonSubClass(UseRC, RC);
// If multiple uses expect disjoint register classes, we emit
// copies in AddRegisterOperand.
@@ -153,6 +176,8 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
// Figure out the register class to create for the destreg.
+ // If SrcReg is a physical register, the corresponding register class could be
+ // non-allocatable, so we prefer UseRC to SrcRC.
if (VRBase) {
DstRC = MRI->getRegClass(VRBase);
} else if (UseRC) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 46a254263c0e4..257fa23608d94 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3038,16 +3038,17 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// instructions resulting in the copies from SCC to these instructions
// will be moved to the VALU.
+def : GCNPat <
+ (i32 (UniformUnaryFrag<zext> i1:$src)),
+ (S_AND_B32 $src, (i32 1))
+>;
+
let WaveSizePredicate = isWave64 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
>;
-def : GCNPat <
- (i64 (UniformUnaryFrag<zext> i1:$src)),
- (S_AND_B64 $src, (i64 1))
->;
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
@@ -3088,11 +3089,6 @@ def : GCNPat <
(S_AND_B32 $src0, $src1)
>;
-def : GCNPat <
- (i32 (UniformUnaryFrag<zext> i1:$src)),
- (S_AND_B32 $src, (i32 1))
->;
-
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B32 $src0, $src1)
>From d165a018d080589e98b4a34a95e656856c6a4734 Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Fri, 30 Jan 2026 18:16:49 +0000
Subject: [PATCH 3/9] comments
---
.../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 43 +++++++++----------
1 file changed, 20 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 9eb8df71ed165..2e661faafbb95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -107,27 +107,24 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
// Stick to the preferred register classes for legal types.
if (TLI->isTypeLegal(VT)) {
- const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
- UseRC = TRI->getCrossCopyRegClass(SrcRegRC);
- const llvm::TargetRegisterClass *LegalRC =
+ const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
+ const llvm::TargetRegisterClass *CrossCopyRC = TRI->getCrossCopyRegClass(SrcRegRC);
+ const llvm::TargetRegisterClass *LegalRC =
TLI->getRegClassFor(VT, Op->isDivergent());
- if (!TRI->isTypeLegalForClass(*UseRC, VT)) {
- UseRC = LegalRC;
- }
- {
- // If there is a sub class relation between CrossCopyRegClass and
- // natively supported RegClass, the result of getRegClassFor, then
- // we use natively supported RegClass to stick the existing logic.
- // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
- // and x0 is natively supported in regclass `GPR64all`, then `GPR64all` is chosen.
- // However, on AMDGPU, for `scc`, the natively supported regclass is, for
- // some reasons, SGPR_64 but CrossCopyRegClass is SGPR_32. Since there
- // is subclass relation, CrossCopyRegClass, SGPR_32 is picked.
- if (TRI->getCommonSubClass(UseRC, LegalRC)) {
- UseRC = LegalRC;
- }
- }
+ // If there is a subclass relation between the CrossCopyRegClass and the
+ // natively supported RegClass (the result of getRegClassFor), then
+ // we use the natively supported RegClass to stick to the existing logic.
+ // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
+ // and x0 is natively supported in regclass `GPR64all`, so `GPR64all` is chosen.
+ // However, on AMDGPU, for `scc`, the natively supported regclass is, for
+ // some reason, SGPR_64 while the CrossCopyRegClass is SGPR_32. Since there
+ // is a subclass relation, the CrossCopyRegClass, SGPR_32, is picked.
+ if (!TRI->isTypeLegalForClass(*CrossCopyRC, VT) || TRI->getCommonSubClass(CrossCopyRC, LegalRC)) {
+ UseRC = LegalRC;
+ } else {
+ UseRC = CrossCopyRC;
+ }
}
for (SDNode *User : Op->users()) {
@@ -143,7 +140,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
if (User->getOperand(i) != Op)
continue;
- if (VT == MVT::Other || VT == MVT::Glue || !TLI->isTypeLegal(VT))
+ if (VT == MVT::Other || VT == MVT::Glue)
continue;
Match = false;
if (User->isMachineOpcode()) {
@@ -157,7 +154,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
if (!UseRC)
UseRC = RC;
else if (RC) {
- const TargetRegisterClass *ComRC =
+ const TargetRegisterClass *ComRC =
TRI->getCommonSubClass(UseRC, RC);
// If multiple uses expect disjoint register classes, we emit
// copies in AddRegisterOperand.
@@ -174,10 +171,10 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
+ // if (!SrcRC->isAllocatable())
+ // SrcRC = TRI->getCrossCopyRegClass(SrcRC);
// Figure out the register class to create for the destreg.
- // If SrcReg is phsysical register, the corresponding register class could be
- // non allocable, so we prefer UseRC to SrcRC
if (VRBase) {
DstRC = MRI->getRegClass(VRBase);
} else if (UseRC) {
>From 74fe3bee4ccc3f360fb0b4e89b6184ba952e46ee Mon Sep 17 00:00:00 2001
From: Zeng Wu <Zeng.Wu2 at amd.com>
Date: Fri, 30 Jan 2026 12:19:23 -0600
Subject: [PATCH 4/9] comments
---
llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 2e661faafbb95..2300693aa766a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -171,8 +171,6 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
- // if (!SrcRC->isAllocatable())
- // SrcRC = TRI->getCrossCopyRegClass(SrcRC);
// Figure out the register class to create for the destreg.
if (VRBase) {
>From 529e931143253345bd32cb871313997318f466ed Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Wed, 4 Feb 2026 08:36:36 +0000
Subject: [PATCH 5/9] [AMDGPU] update SCCCopies in si-fix-sgpr-copies
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 58 +++++++--
.../CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll | 122 ++++++++++++++++++
2 files changed, 171 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 39a6a7762eea5..68a348cda39b9 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,6 +69,7 @@
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -1183,15 +1184,54 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();
if (SrcReg == AMDGPU::SCC) {
- Register SCCCopy =
- MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
- I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
- MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
- .addImm(-1)
- .addImm(0);
- I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
- TII->get(AMDGPU::COPY), DstReg)
- .addReg(SCCCopy);
+ Register SCCCopy = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0_XEXECRegClass);
+ I = BuildMI(
+ *MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
+ MI.getDebugLoc(), TII->get(AMDGPU::S_CSELECT_B32), SCCCopy)
+ .addImm(-1)
+ .addImm(0);
+
+ assert(DstReg.isVirtual());
+
+ const llvm::TargetRegisterInfo *TRI =
+ MF.getSubtarget().getRegisterInfo();
+
+ for (llvm::MachineOperand &UseOp : MRI->use_operands(DstReg)) {
+ llvm::MachineInstr *UserMI = UseOp.getParent();
+ for (const llvm::MachineOperand &Output : UserMI->defs()) {
+ if (!Output.isReg())
+ continue;
+ Register OutputReg = Output.getReg();
+ const auto RegSize = TRI->getRegSizeInBits(OutputReg, *MRI);
+ if (RegSize == 32) {
+ I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
+ TII->get(AMDGPU::COPY), DstReg)
+ .addReg(SCCCopy);
+ } else {
+ assert(RegSize == 64);
+
+ if (UserMI->getOpcode() != AMDGPU::COPY) {
+ // After DAG-2-DAG selection, e.g.
+ // %12:sreg_32 = COPY $scc
+ // %14:sreg_64_xexec = COPY %12:sreg_32
+ // ....
+ // so if opcode is not COPY, the legalization in ISel will make
+ // sure the copy is legal. Only the COPY inserted in DAG-to-Block
+ // could have this issue.
+ continue;
+ }
+
+ BuildMI(MBB, UserMI, UserMI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+ OutputReg)
+ .addReg(SCCCopy)
+ .addImm(AMDGPU::sub0)
+ .addReg(SCCCopy)
+ .addImm(AMDGPU::sub1);
+ UserMI->eraseFromParent();
+ }
+ }
+ }
+
MI.eraseFromParent();
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
new file mode 100644
index 0000000000000..6404afa911271
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX950 %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+; GCN-LABEL: zext_i1_to_i32_uniform
+; GFX950: s_load_dword s2, s[4:5], 0x34
+; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950: v_mov_b32_e32 v0, 0
+; GFX950: s_waitcnt lgkmcnt(0)
+; GFX950: s_cmpk_eq_i32 s2, 0x171
+; GFX950: s_cselect_b32 s2, -1, 0
+; GFX950: s_and_b32 s2, s2, 1
+; GFX950: v_mov_b32_e32 v1, s2
+; GFX950: global_store_dword v0, v1, s[0:1]
+; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250: s_wait_kmcnt 0x0
+; GFX1250: s_cmp_eq_u32 s2, 0x171
+; GFX1250: s_cselect_b32 s2, -1, 0
+; GFX1250: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250: s_and_b32 s2, s2, 1
+; GFX1250: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250: global_store_b32 v0, v1, s[0:1]
+define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+ %pred = icmp eq i32 %v1, 369
+ %tmp2 = zext i1 %pred to i32
+
+ store i32 %tmp2, ptr addrspace(1) %out64
+ ret void
+}
+
+; GCN-LABEL: zext_i1_to_i64_uniform
+; GFX950: s_load_dword s2, s[4:5], 0x34
+; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950: s_mov_b32 s4, 0
+; GFX950: v_mov_b32_e32 v2, 0
+; GFX950: v_mov_b32_e32 v1, s4
+; GFX950: s_waitcnt lgkmcnt(0)
+; GFX950: s_cmpk_eq_i32 s2, 0x171
+; GFX950: s_cselect_b32 s2, -1, 0
+; GFX950: s_mov_b32 s3, s2
+; GFX950: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX950: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250: s_wait_kmcnt 0x0
+; GFX1250: s_cmp_eq_u32 s2, 0x171
+; GFX1250: s_mov_b32 s2, 0
+; GFX1250: s_cselect_b32 s3, -1, 0
+; GFX1250: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250: v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1250: global_store_b64 v2, v[0:1], s[0:1]
+define amdgpu_kernel void @zext_i1_to_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+ %pred = icmp eq i32 %v1, 369
+ %tmp2 = zext i1 %pred to i64
+
+ store i64 %tmp2, ptr addrspace(1) %out64
+ ret void
+}
+
+; GCN-LABEL: zext_i1_to_i32_ext64_uniform
+; GFX950: s_load_dword s2, s[4:5], 0x34
+; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950: s_mov_b32 s4, 0
+; GFX950: v_mov_b32_e32 v2, 0
+; GFX950: v_mov_b32_e32 v1, s4
+; GFX950: s_waitcnt lgkmcnt(0)
+; GFX950: s_cmpk_eq_i32 s2, 0x171
+; GFX950: s_cselect_b32 s2, -1, 0
+; GFX950: s_mov_b32 s3, s2
+; GFX950: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX950: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250: s_wait_kmcnt 0x0
+; GFX1250: s_cmp_eq_u32 s2, 0x171
+; GFX1250: s_mov_b32 s2, 0
+; GFX1250: s_cselect_b32 s3, -1, 0
+; GFX1250: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250: v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1250: global_store_b64 v2, v[0:1], s[0:1]
+define amdgpu_kernel void @zext_i1_to_i32_ext64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+ %pred = icmp eq i32 %v1, 369
+ %tmp2 = zext i1 %pred to i32
+ %tmp3 = zext i32 %tmp2 to i64
+ store i64 %tmp3, ptr addrspace(1) %out64
+ ret void
+}
+
+; GCN-LABEL: zext_i1_to_i64_trunc32_uniform
+; GFX950: s_load_dword s2, s[4:5], 0x34
+; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX950: v_mov_b32_e32 v0, 0
+; GFX950: s_waitcnt lgkmcnt(0)
+; GFX950: s_cmpk_eq_i32 s2, 0x171
+; GFX950: s_cselect_b32 s2, -1, 0
+; GFX950: s_and_b32 s2, s2, 1
+; GFX950: v_mov_b32_e32 v1, s2
+; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX1250: s_wait_kmcnt 0x0
+; GFX1250: s_cmp_eq_u32 s2, 0x171
+; GFX1250: s_cselect_b32 s2, -1, 0
+; GFX1250: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250: s_and_b32 s2, s2, 1
+; GFX1250: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250: global_store_b32 v0, v1, s[0:1]
+define amdgpu_kernel void @zext_i1_to_i64_trunc32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
+entry:
+ %pred = icmp eq i32 %v1, 369
+ %tmp2 = zext i1 %pred to i64
+ %tmp3 = trunc i64 %tmp2 to i32
+ store i32 %tmp3, ptr addrspace(1) %out64
+ ret void
+}
+
+attributes #0 = { nounwind }
>From a24d6e29f5594c9e1089db0e394040666ec90efc Mon Sep 17 00:00:00 2001
From: root <zengwu13 at amd.com>
Date: Sun, 22 Feb 2026 15:16:57 +0000
Subject: [PATCH 6/9] changes decision order of UseRC
---
.../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 28 ++++++-------------
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 ++--
llvm/lib/Target/AMDGPU/SIInstructions.td | 10 +++++++
4 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 2300693aa766a..2e247456f5c2e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -106,26 +106,6 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
MVT VT = Op.getSimpleValueType();
// Stick to the preferred register classes for legal types.
- if (TLI->isTypeLegal(VT)) {
- const llvm::TargetRegisterClass *SrcRegRC = TRI->getMinimalPhysRegClass(SrcReg);
- const llvm::TargetRegisterClass *CrossCopyRC = TRI->getCrossCopyRegClass(SrcRegRC);
- const llvm::TargetRegisterClass *LegalRC =
- TLI->getRegClassFor(VT, Op->isDivergent());
-
- // If there is a sub class relation between CrossCopyRegClass and
- // natively supported RegClass, the result of getRegClassFor, then
- // we use natively supported RegClass to stick the existing logic.
- // For example, on AArch64, the CrossCopyRegClass of x0 is `GPR64arg`
- // and x0 is natively supported in regclass `GPR64all`, then `GPR64all` is chosen.
- // However, on AMDGPU, for `scc`, the natively supported regclass is, for
- // some reasons, SGPR_64 but CrossCopyRegClass is SGPR_32. Since there
- // is subclass relation, CrossCopyRegClass, SGPR_32 is picked.
- if (!TRI->isTypeLegalForClass(*CrossCopyRC, VT) || TRI->getCommonSubClass(CrossCopyRC, LegalRC)) {
- UseRC = LegalRC;
- } else {
- UseRC = CrossCopyRC;
- }
- }
for (SDNode *User : Op->users()) {
bool Match = true;
@@ -169,6 +149,14 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
break;
}
+
+ if (UseRC == nullptr || !UseRC->isAllocatable()) {
+ // The check is to be removed in another pending PR; it is kept to make SystemZ happy.
+ if (TLI->isTypeLegal(VT)) {
+ UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
+ }
+ }
+
const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 68a348cda39b9..edf6a2872d6f5 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -783,7 +783,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
lowerVGPR2SGPRCopies(MF);
// Postprocessing
- fixSCCCopies(MF);
+ // fixSCCCopies(MF);
for (auto *MI : S2VCopies) {
// Check if it is still valid
if (MI->isCopy()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 301f2fc8dab45..7ebd14bfe5c75 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18852,8 +18852,9 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
- return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
- : &AMDGPU::SReg_32RegClass;
+ return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
+ : &AMDGPU::SReg_32RegClass;
+
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
if (TRI->isSGPRClass(RC) && isDivergent) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 257fa23608d94..be1af6cda43d7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3043,6 +3043,16 @@ def : GCNPat <
(S_AND_B32 $src, (i32 1))
>;
+// Since the operand of S_AND_B64 is SReg_64, which does not support i1.
+let AddedComplexity = 10 in
+def : GCNPat <
+ (i64 (UniformUnaryFrag<zext> i1:$src)),
+ (REG_SEQUENCE SReg_64,
+ (S_MOV_B32 (S_AND_B32 $src, (i32 1))), sub0,
+ (S_MOV_B32 (i32 0)), sub1
+ )
+>;
+
let WaveSizePredicate = isWave64 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
>From 421971dee9052ad4df4a34194e92534156c81af2 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Mon, 2 Mar 2026 03:24:17 +0000
Subject: [PATCH 7/9] use common-sub-regclass
---
llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 2e247456f5c2e..3096ab5ccb698 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/PseudoProbe.h"
@@ -149,12 +150,16 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
break;
}
+ const TargetRegisterClass *RegClassForVT = nullptr;
+ // The check is to be removed in another pending PR; it is kept to make SystemZ happy.
+ if (TLI->isTypeLegal(VT)) {
+ RegClassForVT = TLI->getRegClassFor(VT, Op->isDivergent());
+ }
if (UseRC == nullptr || !UseRC->isAllocatable()) {
- // The check is to be removed in other pending PR, it is kept to make System Z happy.
- if (TLI->isTypeLegal(VT)) {
- UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
- }
+ UseRC = RegClassForVT;
+ } else if (auto CommonSubClass = TRI->getCommonSubClass(UseRC, RegClassForVT)) {
+ UseRC = CommonSubClass;
}
const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
>From cdc8e8023d68638ef67afe1ac21f04da6a93afb3 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Tue, 3 Mar 2026 06:19:27 +0000
Subject: [PATCH 8/9] remove S_AND, S_OR etc under predicate isWave64
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 44 ++---------------------
llvm/lib/Target/AMDGPU/SOPInstructions.td | 11 +++++-
2 files changed, 12 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index be1af6cda43d7..0bd3de21e44a8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3053,47 +3053,7 @@ def : GCNPat <
)
>;
-let WaveSizePredicate = isWave64 in {
-def : GCNPat <
- (i1 (and i1:$src0, i1:$src1)),
- (S_AND_B64 $src0, $src1)
->;
-
-
-def : GCNPat <
- (i1 (or i1:$src0, i1:$src1)),
- (S_OR_B64 $src0, $src1)
->;
-
-def : GCNPat <
- (i1 (xor i1:$src0, i1:$src1)),
- (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
- (i1 (add i1:$src0, i1:$src1)),
- (S_XOR_B64 $src0, $src1)
->;
-
-def : GCNPat <
- (i1 (sub i1:$src0, i1:$src1)),
- (S_XOR_B64 $src0, $src1)
->;
-
-let AddedComplexity = 1 in {
-def : GCNPat <
- (i1 (add i1:$src0, (i1 -1))),
- (S_NOT_B64 $src0)
->;
-
-def : GCNPat <
- (i1 (sub i1:$src0, (i1 -1))),
- (S_NOT_B64 $src0)
->;
-}
-} // end isWave64
-
-let WaveSizePredicate = isWave32 in {
+// let WaveSizePredicate = isWave32 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B32 $src0, $src1)
@@ -3130,7 +3090,7 @@ def : GCNPat <
(S_NOT_B32 $src0)
>;
}
-} // end isWave32
+// } // end isWave32
def : GCNPat <
(i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..b95dae2e8a712 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1997,9 +1997,18 @@ def UniformSelect : PatFrag<
[{ return !N->isDivergent(); }]
>;
+def FreezeUniformSelect : PatFrag<
+ (ops node:$src0, node:$src1),
+ (select (freeze SCC), node:$src0, node:$src1),
+ [{
+ SDValue FreezeNode = N->getOperand(0);
+ return FreezeNode->hasOneUse() && !N->isDivergent();
+ }]
+>;
+
let AddedComplexity = 20 in {
def : GCNPat<
- (i32 (UniformSelect i32:$src0, i32:$src1)),
+ (i32 (FreezeUniformSelect i32:$src0, i32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;
>From 55e1ae6019080b00d069ee1f3e6b6199c02b5e35 Mon Sep 17 00:00:00 2001
From: root <Zeng.Wu2 at amd.com>
Date: Tue, 3 Mar 2026 06:19:27 +0000
Subject: [PATCH 9/9] Hack On fixSCCCopies
Revert "[AMDGPU] update SCCCopies in si-fix-sgpr-copies"
This reverts commit 529e931143253345bd32cb871313997318f466ed.
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 87 ++++++-------
llvm/lib/Target/AMDGPU/SIInstructions.td | 1 +
.../CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll | 122 ------------------
3 files changed, 41 insertions(+), 169 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index edf6a2872d6f5..605217b5d0095 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,8 +69,8 @@
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -783,7 +783,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
lowerVGPR2SGPRCopies(MF);
// Postprocessing
- // fixSCCCopies(MF);
+ fixSCCCopies(MF);
for (auto *MI : S2VCopies) {
// Check if it is still valid
if (MI->isCopy()) {
@@ -1174,64 +1174,35 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
const AMDGPU::LaneMaskConstants &LMC =
AMDGPU::LaneMaskConstants::get(MF.getSubtarget<GCNSubtarget>());
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
for (MachineBasicBlock &MBB : MF) {
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
- ++I) {
- MachineInstr &MI = *I;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+
+ MachineInstr &MI = *I++;
// May already have been lowered.
if (!MI.isCopy())
continue;
Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();
if (SrcReg == AMDGPU::SCC) {
- Register SCCCopy = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0_XEXECRegClass);
- I = BuildMI(
+ assert(DstReg.isVirtual());
+ Register NewDstReg = MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
+ auto NewCopy = BuildMI(
*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
- MI.getDebugLoc(), TII->get(AMDGPU::S_CSELECT_B32), SCCCopy)
+ MI.getDebugLoc(), TII->get(LMC.CSelectOpc), NewDstReg)
.addImm(-1)
.addImm(0);
- assert(DstReg.isVirtual());
-
- const llvm::TargetRegisterInfo *TRI =
- MF.getSubtarget().getRegisterInfo();
-
- for (llvm::MachineOperand &UseOp : MRI->use_operands(DstReg)) {
- llvm::MachineInstr *UserMI = UseOp.getParent();
- for (const llvm::MachineOperand &Output : UserMI->defs()) {
- if (!Output.isReg())
- continue;
- Register OutputReg = Output.getReg();
- const auto RegSize = TRI->getRegSizeInBits(OutputReg, *MRI);
- if (RegSize == 32) {
- I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
- TII->get(AMDGPU::COPY), DstReg)
- .addReg(SCCCopy);
- } else {
- assert(RegSize == 64);
-
- if (UserMI->getOpcode() != AMDGPU::COPY) {
- // After DAG-2-DAG selection, e.g.
- // %12:sreg_32 = COPY $scc
- // %14:sreg_64_xexec = COPY %12:sreg_32
- // ....
- // so if opcode is not COPY, the legalization in ISel will make
- // sure the the copy is legal. Only the COPY inserted in DAG to Block
- // could have this issue.
- continue;
- }
+ for (MachineOperand &UseMO : llvm::make_early_inc_range(MRI->use_operands(DstReg))) {
+ // We must NOT rewrite any operand inside the new instruction we just made!
+ if (UseMO.getParent() == NewCopy)
+ continue;
- BuildMI(MBB, UserMI, UserMI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
- OutputReg)
- .addReg(SCCCopy)
- .addImm(AMDGPU::sub0)
- .addReg(SCCCopy)
- .addImm(AMDGPU::sub1);
- UserMI->eraseFromParent();
- }
- }
+ UseMO.setReg(NewDstReg);
}
-
MI.eraseFromParent();
continue;
}
@@ -1242,8 +1213,30 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
.addReg(Tmp, getDefRegState(true))
.addReg(SrcReg)
.addReg(LMC.ExecReg);
+
MI.eraseFromParent();
}
+
+ if (SrcReg.isPhysical() || DstReg.isPhysical()) continue;
+ auto SrcRC = MRI->getRegClass(SrcReg);
+ auto DstRC = MRI->getRegClass(DstReg);
+
+ if (TRI->isSGPRClass(DstRC) && TRI->isSGPRClass(SrcRC) && DstRC != SrcRC) {
+ auto DstBitWidth = TRI->getRegSizeInBits(DstReg, *MRI);
+ auto SrcBitWidth = TRI->getRegSizeInBits(SrcReg, *MRI);
+
+ if (DstBitWidth == 64 && SrcBitWidth == 32) {
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+ DstReg)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ } else if (DstBitWidth == 32 && SrcBitWidth == 64) {
+ MI.getOperand(1).setSubReg(AMDGPU::sub0);
+ }
+ }
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0bd3de21e44a8..3b9c88976929c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2452,6 +2452,7 @@ def : GCNPat <
(V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
>;
+
def : GCNPat <
(bf16 fpimm:$imm),
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
deleted file mode 100644
index 6404afa911271..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.zext.i1.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX950 %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-; GCN-LABEL: zext_i1_to_i32_uniform
-; GFX950: s_load_dword s2, s[4:5], 0x34
-; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950: v_mov_b32_e32 v0, 0
-; GFX950: s_waitcnt lgkmcnt(0)
-; GFX950: s_cmpk_eq_i32 s2, 0x171
-; GFX950: s_cselect_b32 s2, -1, 0
-; GFX950: s_and_b32 s2, s2, 1
-; GFX950: v_mov_b32_e32 v1, s2
-; GFX950: global_store_dword v0, v1, s[0:1]
-; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250: s_wait_kmcnt 0x0
-; GFX1250: s_cmp_eq_u32 s2, 0x171
-; GFX1250: s_cselect_b32 s2, -1, 0
-; GFX1250: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250: s_and_b32 s2, s2, 1
-; GFX1250: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250: global_store_b32 v0, v1, s[0:1]
-define amdgpu_kernel void @zext_i1_to_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
- %pred = icmp eq i32 %v1, 369
- %tmp2 = zext i1 %pred to i32
-
- store i32 %tmp2, ptr addrspace(1) %out64
- ret void
-}
-
-; GCN-LABEL: zext_i1_to_i64_uniform
-; GFX950: s_load_dword s2, s[4:5], 0x34
-; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950: s_mov_b32 s4, 0
-; GFX950: v_mov_b32_e32 v2, 0
-; GFX950: v_mov_b32_e32 v1, s4
-; GFX950: s_waitcnt lgkmcnt(0)
-; GFX950: s_cmpk_eq_i32 s2, 0x171
-; GFX950: s_cselect_b32 s2, -1, 0
-; GFX950: s_mov_b32 s3, s2
-; GFX950: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX950: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250: s_wait_kmcnt 0x0
-; GFX1250: s_cmp_eq_u32 s2, 0x171
-; GFX1250: s_mov_b32 s2, 0
-; GFX1250: s_cselect_b32 s3, -1, 0
-; GFX1250: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250: v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX1250: global_store_b64 v2, v[0:1], s[0:1]
-define amdgpu_kernel void @zext_i1_to_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
- %pred = icmp eq i32 %v1, 369
- %tmp2 = zext i1 %pred to i64
-
- store i64 %tmp2, ptr addrspace(1) %out64
- ret void
-}
-
-; GCN-LABEL: zext_i1_to_i32_ext64_uniform
-; GFX950: s_load_dword s2, s[4:5], 0x34
-; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950: s_mov_b32 s4, 0
-; GFX950: v_mov_b32_e32 v2, 0
-; GFX950: v_mov_b32_e32 v1, s4
-; GFX950: s_waitcnt lgkmcnt(0)
-; GFX950: s_cmpk_eq_i32 s2, 0x171
-; GFX950: s_cselect_b32 s2, -1, 0
-; GFX950: s_mov_b32 s3, s2
-; GFX950: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX950: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250: s_wait_kmcnt 0x0
-; GFX1250: s_cmp_eq_u32 s2, 0x171
-; GFX1250: s_mov_b32 s2, 0
-; GFX1250: s_cselect_b32 s3, -1, 0
-; GFX1250: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250: v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX1250: global_store_b64 v2, v[0:1], s[0:1]
-define amdgpu_kernel void @zext_i1_to_i32_ext64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
- %pred = icmp eq i32 %v1, 369
- %tmp2 = zext i1 %pred to i32
- %tmp3 = zext i32 %tmp2 to i64
- store i64 %tmp3, ptr addrspace(1) %out64
- ret void
-}
-
-; GCN-LABEL: zext_i1_to_i64_trunc32_uniform
-; GFX950: s_load_dword s2, s[4:5], 0x34
-; GFX950: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX950: v_mov_b32_e32 v0, 0
-; GFX950: s_waitcnt lgkmcnt(0)
-; GFX950: s_cmpk_eq_i32 s2, 0x171
-; GFX950: s_cselect_b32 s2, -1, 0
-; GFX950: s_and_b32 s2, s2, 1
-; GFX950: v_mov_b32_e32 v1, s2
-; GFX1250: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250: s_wait_kmcnt 0x0
-; GFX1250: s_cmp_eq_u32 s2, 0x171
-; GFX1250: s_cselect_b32 s2, -1, 0
-; GFX1250: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250: s_and_b32 s2, s2, 1
-; GFX1250: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250: global_store_b32 v0, v1, s[0:1]
-define amdgpu_kernel void @zext_i1_to_i64_trunc32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %out64, i32 %v1) #0 {
-entry:
- %pred = icmp eq i32 %v1, 369
- %tmp2 = zext i1 %pred to i64
- %tmp3 = trunc i64 %tmp2 to i32
- store i32 %tmp3, ptr addrspace(1) %out64
- ret void
-}
-
-attributes #0 = { nounwind }
More information about the llvm-commits
mailing list