[llvm] [AMDGPU][True16][CodeGen] readfirstlane for vgpr16 copy to sgpr32 (PR #137848)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri May 2 07:11:17 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/137848
>From 9adcefeef3465b31154020f404bb608d0a987bda Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 29 Apr 2025 13:28:11 -0400
Subject: [PATCH] fix vgpr16 copy to sgpr32
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 21 ++++++++--
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 9 +---
.../fix-sgpr-copies-vgpr16-to-spgr32.ll | 41 +++++++++++++++++++
3 files changed, 59 insertions(+), 12 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index bb8e9a092e07c..47052a80bf125 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1086,10 +1086,23 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
- // HACK to handle possible 16bit VGPR source
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+ assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+ "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+ "we have 16-bit VGPRs");
+ assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+ // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+ if (MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass)
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
+ Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32)
+ .addImm(0)
+ .addReg(SrcReg, 0)
+ .addImm(AMDGPU::lo16);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(VReg32);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 071f55ce16403..352a3f9c2d27f 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1472,16 +1472,9 @@ def : GCNPat <
} // End OtherPredicates = [isGFX8Plus, p]
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
- (i32 (DivergentUnaryFrag<anyext> i16:$src)),
- (COPY $src)
->;
-} // End True16Predicate = UseFakeTrue16Insts
-
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat<
- (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+ (i32 (UniformUnaryFrag<anyext> i16:$src)),
(COPY $src)
>;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
new file mode 100644
index 0000000000000..0b42274f9553d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
+
+; Expect v_readfirstlane_b32 to read the full 32-bit VGPR (v0) rather than the 16-bit half (v0.l).
+define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) {
+; CHECK-LABEL: vgpr16_copyto_sgpr:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_and_b32 s0, 0xffff, s0
+; CHECK-NEXT: s_mul_i32 s0, s0, 5
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 s0, 2
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %a1
+; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_branch .LBB0_3
+; CHECK-NEXT: .LBB0_2: ; %a2
+; CHECK-NEXT: s_mov_b32 s0, 2
+; CHECK-NEXT: s_branch .LBB0_3
+; CHECK-NEXT: .LBB0_3:
+entry:
+ %1 = load <4 x float>, ptr addrspace(3) poison, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = fptrunc float %2 to half
+ %4 = bitcast half %3 to i16
+ %5 = zext i16 %4 to i32
+ %6 = add i32 %5, 1
+ %7 = mul i32 %6, 5
+ %8 = icmp eq i32 %7, 7
+ br i1 %8, label %a1, label %a2
+
+a1:
+ ret i32 1
+
+a2:
+ ret i32 2
+}
More information about the llvm-commits
mailing list