[llvm] [AMDGPU][True16][CodeGen]Support V2S copy with True16 flow (PR #118037)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 14:49:23 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/118037
>From bc0eb4438df3f9edf3557877bab8073d0db9fc90 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 28 Nov 2024 13:26:35 -0500
Subject: [PATCH] Support V2S copy with True16 inst format. V2S COPY can be
emitted as either
sgpr_32 = COPY vgpr_16
or
sgpr_lo16 = COPY vgpr_16
For a 16-bit source, emit a REG_SEQUENCE whose hi16 half is undef and
pass it to V_READFIRSTLANE_B32
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 23 +++-
.../AMDGPU/true16-copy-vgpr16-to-sgpr32.mir | 118 ++++++++++++++++++
2 files changed, 137 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ac69bf6d038ece..9749d09592bab6 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1075,10 +1075,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
- // HACK to handle possible 16bit VGPR source
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+ assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+ "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+ "we have 16-bit VGPRs");
+ assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass);
+ // There is no V_READFIRSTLANE_B16, so widen the destination scalar
+ // value to 32 bits
+ MRI->setRegClass(DstReg, &AMDGPU::SGPR_32RegClass);
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI->getDebugLoc();
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), TmpReg)
+ .addReg(SrcReg, 0, SubReg)
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(TmpReg);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
diff --git a/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir b/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
new file mode 100644
index 00000000000000..779a749861a62d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
@@ -0,0 +1,118 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s
+
+# Ensure READFIRSTLANE is generated, and that its src is REG_SEQUENCE.
+
+---
+name: test4
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test4
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.3
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], [[DEF]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY2]], [[DEF]], [[S_MOV_B32_1]], 2, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN]].lo16
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN1]].lo16
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[COPY6]], 16, killed [[V_AND_B32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_lo16 = COPY [[PHI1]].lo16
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_16 = COPY [[COPY7]]
+ ; CHECK-NEXT: [[V_SUB_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SUB_NC_U16_t16_e64 0, [[COPY8]], 0, killed [[COPY3]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SUB_NC_U16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[V_READFIRSTLANE_B32_]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12
+ ; CHECK-NEXT: S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_3]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 18
+ ; CHECK-NEXT: S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: S_ENDPGM 0
+
+ bb.0:
+ successors: %bb.1(0x80000000)
+
+ %0:sgpr_128 = IMPLICIT_DEF
+ %2:sreg_32 = S_MOV_B32 0
+ %3:sgpr_128 = IMPLICIT_DEF
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %5:sreg_32 = PHI %2, %bb.0, %6, %bb.3
+ %7:sreg_32 = PHI %2, %bb.0, %8, %bb.3
+
+ bb.2:
+ successors: %bb.4(0x40000000), %bb.3(0x40000000)
+
+ %9:sreg_32 = S_MOV_B32 0
+ %10:vgpr_32 = COPY %5
+ %11:vgpr_32 = COPY %5
+ %12:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %11, %0, %9, 0, 0, 0, implicit $exec
+ %13:vgpr_32 = COPY %5
+ %14:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %13, %0, %9, 2, 0, 0, implicit $exec
+ %15:vgpr_16 = COPY %12.lo16
+ %16:vgpr_16 = COPY %14.lo16
+ %17:sreg_32 = COPY %15
+ %18:sreg_32 = COPY %16
+ %19:sreg_32 = S_PACK_LL_B32_B16 %17, %18
+ %20:sgpr_lo16 = COPY %7.lo16
+ %21:vgpr_16 = COPY %20
+ %22:vgpr_16 = V_SUB_NC_U16_t16_e64 0, %21, 0, killed %15, 0, 0, implicit $exec
+ %23:sreg_32 = COPY killed %22
+ %24:sreg_32 = S_MOV_B32 255
+ %25:sreg_32 = S_AND_B32 killed %23, killed %24, implicit-def dead $scc
+ %26:sreg_32 = S_MOV_B32 12
+ S_CMP_LT_I32 %25, killed %26, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.4, implicit $scc
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4(0x40000000), %bb.1(0x40000000)
+
+ %6:sreg_32 = S_MOV_B32 -1
+ %8:sreg_32 = IMPLICIT_DEF
+ %27:sreg_32 = S_MOV_B32 18
+ S_CMP_LT_I32 %25, killed %27, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM 0
+...
More information about the llvm-commits
mailing list