[llvm] [AMDGPU][True16][CodeGen] Support V2S copy with True16 inst format. (PR #118037)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 28 14:08:11 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
A VGPR-to-SGPR (V2S) COPY from a 16-bit VGPR can be emitted as either
sgpr_32 = COPY vgpr_16
or
sgpr_lo16 = COPY vgpr_16
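For a 16-bit source, widen it to 32 bits with a REG_SEQUENCE whose hi16 half is undef, and use that as the source of V_READFIRSTLANE_B32 (there is no 16-bit readfirstlane).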
---
Full diff: https://github.com/llvm/llvm-project/pull/118037.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (+19-4)
- (added) llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir (+118)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ac69bf6d038ece..9749d09592bab6 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1075,10 +1075,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
- // HACK to handle possible 16bit VGPR source
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+ assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+ "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+ "we have 16-bit VGPRs");
+ assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass);
+ // There is no V_READFIRSTLANE_B16, so widen the destination scalar
+ // value to 32 bits
+ MRI->setRegClass(DstReg, &AMDGPU::SGPR_32RegClass);
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI->getDebugLoc();
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), TmpReg)
+ .addReg(SrcReg, 0, SubReg)
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(TmpReg);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
diff --git a/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir b/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
new file mode 100644
index 00000000000000..640245b53b5c0a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
@@ -0,0 +1,118 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s
+
+# Ensure READFIRSTLANE is generated, and that its src is REG_SEQUENCE.
+
+---
+name: test4
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test4
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY undef %1:sgpr_128
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY undef %4:sgpr_128
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %8, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY3]], [[COPY]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[COPY]], [[S_MOV_B32_1]], 2, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN]].lo16
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN1]].lo16
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_MOV_B32_e32_]], [[COPY7]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[COPY8]], 16, killed [[V_AND_B32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_lo16 = COPY [[PHI1]].lo16
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_16 = COPY [[COPY9]]
+ ; CHECK-NEXT: [[V_SUB_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SUB_NC_U16_t16_e64 0, [[COPY10]], 0, killed [[COPY5]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SUB_NC_U16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[V_READFIRSTLANE_B32_]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12
+ ; CHECK-NEXT: S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_3]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 18
+ ; CHECK-NEXT: S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.1:
+ successors: %bb.3(0x80000000); %bb.3(100.00%)
+
+ %1:sgpr_128 = COPY undef %150:sgpr_128
+ %131:sreg_32 = S_MOV_B32 0
+ %2:sgpr_128 = COPY undef %151:sgpr_128
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4(0x80000000); %bb.4(100.00%)
+
+ %3:sreg_32 = PHI %131:sreg_32, %bb.1, %183, %bb.5
+ %4:sreg_32 = PHI %131:sreg_32, %bb.1, %182, %bb.5
+
+ bb.4:
+ successors: %bb.6(0x40000000), %bb.5(0x40000000); %bb.5(50.00%), %bb.6(50.00%)
+
+ %154:sreg_32 = S_MOV_B32 0
+ %156:vgpr_32 = COPY %3:sreg_32
+ %162:vgpr_32 = COPY %3:sreg_32
+ %161:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %162:vgpr_32, %1:sgpr_128, %154:sreg_32, 0, 0, 0, implicit $exec
+ %164:vgpr_32 = COPY %3:sreg_32
+ %163:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %164:vgpr_32, %1:sgpr_128, %154:sreg_32, 2, 0, 0, implicit $exec
+ %9:vgpr_16 = COPY %161.lo16:vgpr_32
+ %10:vgpr_16 = COPY %163.lo16:vgpr_32
+ %165:sreg_32 = COPY %9:vgpr_16
+ %166:sreg_32 = COPY %10:vgpr_16
+ %12:sreg_32 = S_PACK_LL_B32_B16 %165:sreg_32, %166:sreg_32
+ %167:sgpr_lo16 = COPY %4.lo16:sreg_32
+ %170:vgpr_16 = COPY %167:sgpr_lo16
+ %177:vgpr_16 = V_SUB_NC_U16_t16_e64 0, %170:vgpr_16, 0, killed %9:vgpr_16, 0, 0, implicit $exec
+ %179:sreg_32 = COPY killed %177:vgpr_16
+ %180:sreg_32 = S_MOV_B32 255
+ %13:sreg_32 = S_AND_B32 killed %179:sreg_32, killed %180:sreg_32, implicit-def dead $scc
+ %181:sreg_32 = S_MOV_B32 12
+ S_CMP_LT_I32 %13:sreg_32, killed %181:sreg_32, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.6, implicit $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ successors: %bb.6(0x40000000), %bb.3(0x40000000); %bb.6(50.00%), %bb.3(50.00%)
+
+ %183:sreg_32 = S_MOV_B32 -1
+ %182:sreg_32 = IMPLICIT_DEF
+ %184:sreg_32 = S_MOV_B32 18
+ S_CMP_LT_I32 %13:sreg_32, killed %184:sreg_32, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.3, implicit $scc
+ S_BRANCH %bb.6
+
+ bb.6:
+ S_ENDPGM 0
+
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/118037
More information about the llvm-commits
mailing list