[llvm] [AMDGPU][True16][CodeGen] readfirstlane for vgpr16 copy to sgpr32 (PR #118037)
Brox Chen via llvm-commits
llvm-commits@lists.llvm.org
Mon May 5 11:49:21 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/118037
From 2973ed7061c41a33ba75b68a047e3b51a7b953c9 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2@amd.com>
Date: Tue, 29 Apr 2025 13:28:11 -0400
Subject: [PATCH 1/3] fix vgpr16 copy to sgpr32
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 20 ++++--
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 9 +--
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 63 +++++++++++++++++++
.../fix-sgpr-copies-vgpr16-to-spgr32.ll | 41 ++++++++++++
4 files changed, 121 insertions(+), 12 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index bb8e9a092e07c..caa5c7a599b8b 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1086,10 +1086,22 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
- // HACK to handle possible 16bit VGPR source
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+ assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+ "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+ "we have 16-bit VGPRs");
+ assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+ // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+ Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32)
+ .addImm(0)
+ .addReg(SrcReg, 0)
+ .addImm(AMDGPU::lo16);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(VReg32);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
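
For reference, the lowering above turns a 16-bit VGPR-to-SGPR copy into a
32-bit widen followed by a readfirstlane. A minimal MIR sketch of the
transformation (illustrative only, not taken from the patch's tests; the
register names are arbitrary):

    Before (what lowerVGPR2SGPRCopies sees):
      %dst:sreg_32 = COPY %src:vgpr_16

    After this first patch (patch 2/3 below switches the widen to a REG_SEQUENCE):
      %widened:vgpr_32 = SUBREG_TO_REG 0, %src:vgpr_16, %subreg.lo16
      %dst:sreg_32_xm0 = V_READFIRSTLANE_B32 %widened, implicit $exec
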
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 071f55ce16403..352a3f9c2d27f 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1472,16 +1472,9 @@ def : GCNPat <
} // End OtherPredicates = [isGFX8Plus, p]
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
- (i32 (DivergentUnaryFrag<anyext> i16:$src)),
- (COPY $src)
->;
-} // End True16Predicate = UseFakeTrue16Insts
-
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat<
- (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+ (i32 (UniformUnaryFrag<anyext> i16:$src)),
(COPY $src)
>;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 137a9aaea6a77..28f498e1e0518 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -53,3 +53,66 @@ body: |
%3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
+
+---
+name: vgpr16_to_spgr32
+body: |
+ ; GCN-LABEL: name: vgpr16_to_spgr32
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
+ ; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16
+ ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[SUBREG_TO_REG]], implicit $exec
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
+ ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
+ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
+ ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]]
+ ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_6]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+
+ %5:sreg_32 = IMPLICIT_DEF
+ %6:vgpr_32 = COPY %5:sreg_32
+ %4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
+ %7:sgpr_32 = COPY %4.sub0:vreg_64
+ %8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ %9:sreg_32 = S_MOV_B32 65535
+ %11:sreg_32 = COPY %8:vgpr_16
+ %10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc
+ %12:sreg_32 = S_MOV_B32 5
+ %13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32
+ %14:sreg_32 = S_MOV_B32 2
+ S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+ bb.1:
+ %17:sreg_32 = S_MOV_B32 1
+ %18:sreg_32 = S_MOV_B32 killed %17:sreg_32
+ $sgpr0 = COPY %18:sreg_32
+ SI_RETURN_TO_EPILOG $sgpr0
+ bb.2:
+ %15:sreg_32 = S_MOV_B32 2
+ %16:sreg_32 = S_MOV_B32 killed %15:sreg_32
+ $sgpr0 = COPY %16:sreg_32
+ SI_RETURN_TO_EPILOG $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
new file mode 100644
index 0000000000000..0b42274f9553d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
+
+; expect readfirstlane to pick the 32bit register
+define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) {
+; CHECK-LABEL: vgpr16_copyto_sgpr:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_and_b32 s0, 0xffff, s0
+; CHECK-NEXT: s_mul_i32 s0, s0, 5
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 s0, 2
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %a1
+; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_branch .LBB0_3
+; CHECK-NEXT: .LBB0_2: ; %a2
+; CHECK-NEXT: s_mov_b32 s0, 2
+; CHECK-NEXT: s_branch .LBB0_3
+; CHECK-NEXT: .LBB0_3:
+entry:
+ %1 = load <4 x float>, ptr addrspace(3) poison, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = fptrunc float %2 to half
+ %4 = bitcast half %3 to i16
+ %5 = zext i16 %4 to i32
+ %6 = add i32 %5, 1
+ %7 = mul i32 %6, 5
+ %8 = icmp eq i32 %7, 7
+ br i1 %8, label %a1, label %a2
+
+a1:
+ ret i32 1
+
+a2:
+ ret i32 2
+}
From 43a7ff511207f344dae10ab72d41e804949ef619 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2@amd.com>
Date: Mon, 5 May 2025 13:00:49 -0400
Subject: [PATCH 2/3] replace subreg_to_reg with reg_sequence
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 11 +++++++----
.../CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir | 5 +++--
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index caa5c7a599b8b..6f89a3a207f93 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1096,10 +1096,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
const DebugLoc &DL = MI->getDebugLoc();
- BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32)
- .addImm(0)
- .addReg(SrcReg, 0)
- .addImm(AMDGPU::lo16);
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
+ .addReg(SrcReg, 0, SubReg)
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(VReg32);
} else if (SrcSize == 32) {
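
SUBREG_TO_REG is replaced here because its immediate operand asserts that the
bits outside the named subregister hold that value (zero above), which is not
actually known for the hi16 half of the widened VGPR. A REG_SEQUENCE that
pairs the lo16 source with an IMPLICIT_DEF hi16 instead models the high half
as undefined. A rough sketch of the difference (illustrative only; register
names are arbitrary):

    ; old form: claims the hi16 lanes are zero, which is not guaranteed
    %widened:vgpr_32 = SUBREG_TO_REG 0, %src:vgpr_16, %subreg.lo16

    ; new form: the hi16 half is explicitly undefined
    %undef:vgpr_16 = IMPLICIT_DEF
    %widened:vgpr_32 = REG_SEQUENCE %src:vgpr_16, %subreg.lo16, %undef, %subreg.hi16
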
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 28f498e1e0518..6e24d9afa2bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -67,8 +67,9 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16
- ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[SUBREG_TO_REG]], implicit $exec
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
From 7706bea702081bb5fc83aef32765490a80990620 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2@amd.com>
Date: Mon, 5 May 2025 14:49:08 -0400
Subject: [PATCH 3/3] update test
---
.../fix-sgpr-copies-vgpr16-to-spgr32.ll | 23 +++++++++++--------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
index 0b42274f9553d..5df61f19033a3 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -1,10 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
+@lds = external local_unnamed_addr addrspace(3) global [4 x float], align 4
+
; expect readfirstlane to pick the 32bit register
-define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) {
+define amdgpu_gs i32 @vgpr16_copyto_sgpr() {
; CHECK-LABEL: vgpr16_copyto_sgpr:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_mov_b32_e32 v0, lds@abs32@lo
; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
@@ -23,15 +26,15 @@ define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrsp
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_3:
entry:
- %1 = load <4 x float>, ptr addrspace(3) poison, align 4
- %2 = extractelement <4 x float> %1, i32 0
- %3 = fptrunc float %2 to half
- %4 = bitcast half %3 to i16
- %5 = zext i16 %4 to i32
- %6 = add i32 %5, 1
- %7 = mul i32 %6, 5
- %8 = icmp eq i32 %7, 7
- br i1 %8, label %a1, label %a2
+ %ptr = load <4 x float>, ptr addrspace(3) @lds, align 4
+ %f = extractelement <4 x float> %ptr, i32 0
+ %half = fptrunc float %f to half
+ %i16 = bitcast half %half to i16
+ %i32 = zext i16 %i16 to i32
+ %add = add i32 %i32, 1
+ %mul = mul i32 %add, 5
+ %icmp = icmp eq i32 %mul, 7
+ br i1 %icmp, label %a1, label %a2
a1:
ret i32 1