[llvm] [AMDGPU][True16][CodeGen] Support V2S copy with True16 inst format. (PR #118037)

via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 28 14:08:11 PST 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes:

A V2S (VGPR-to-SGPR) COPY can be emitted as either

sgpr_32 = COPY vgpr_16

or

sgpr_lo16 = COPY vgpr_16

Since there is no V_READFIRSTLANE_B16, lower a 16-bit source by building a REG_SEQUENCE with the source in the lo16 half and an undef value in the hi16 half, then read the first lane of the resulting 32-bit VGPR with V_READFIRSTLANE_B32.
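
For reference, a minimal MIR sketch of the lowered sequence (register names are illustrative; the shape follows the pattern checked in the new test):

```
; %dst:sgpr_32 = COPY %src:vgpr_16 is lowered to:
%undef:vgpr_16 = IMPLICIT_DEF
%tmp:vgpr_32 = REG_SEQUENCE %src, %subreg.lo16, %undef, %subreg.hi16
%dst:sgpr_32 = V_READFIRSTLANE_B32 %tmp, implicit $exec
```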

---
Full diff: https://github.com/llvm/llvm-project/pull/118037.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (+19-4) 
- (added) llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir (+118) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ac69bf6d038ece..9749d09592bab6 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1075,10 +1075,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
     size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
     if (SrcSize == 16) {
-      // HACK to handle possible 16bit VGPR source
-      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
-                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
-      MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+      assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+             "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+             "we have 16-bit VGPRs");
+      assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+             MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass);
+      // There is no V_READFIRSTLANE_B16, so widen the destination scalar
+      // value to 32 bits
+      MRI->setRegClass(DstReg, &AMDGPU::SGPR_32RegClass);
+      Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      const DebugLoc &DL = MI->getDebugLoc();
+      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), TmpReg)
+          .addReg(SrcReg, 0, SubReg)
+          .addImm(AMDGPU::lo16)
+          .addReg(Undef)
+          .addImm(AMDGPU::hi16);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+          .addReg(TmpReg);
     } else if (SrcSize == 32) {
       auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                          TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
diff --git a/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir b/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
new file mode 100644
index 00000000000000..640245b53b5c0a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/true16-copy-vgpr16-to-sgpr32.mir
@@ -0,0 +1,118 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s
+
+# Ensure READFIRSTLANE is generated, and that its src is REG_SEQUENCE.
+
+---
+name:            test4
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test4
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY undef %1:sgpr_128
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_128 = COPY undef %4:sgpr_128
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %8, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY3]], [[COPY]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[COPY]], [[S_MOV_B32_1]], 2, 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN]].lo16
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN1]].lo16
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+  ; CHECK-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_MOV_B32_e32_]], [[COPY7]], implicit $exec
+  ; CHECK-NEXT:   [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[COPY8]], 16, killed [[V_AND_B32_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sgpr_lo16 = COPY [[PHI1]].lo16
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vgpr_16 = COPY [[COPY9]]
+  ; CHECK-NEXT:   [[V_SUB_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SUB_NC_U16_t16_e64 0, [[COPY10]], 0, killed [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SUB_NC_U16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[V_READFIRSTLANE_B32_]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12
+  ; CHECK-NEXT:   S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_3]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 18
+  ; CHECK-NEXT:   S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_5]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.1:
+    successors: %bb.3(0x80000000); %bb.3(100.00%)
+
+    %1:sgpr_128 = COPY undef %150:sgpr_128
+    %131:sreg_32 = S_MOV_B32 0
+    %2:sgpr_128 = COPY undef %151:sgpr_128
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4(0x80000000); %bb.4(100.00%)
+
+    %3:sreg_32 = PHI %131:sreg_32, %bb.1, %183, %bb.5
+    %4:sreg_32 = PHI %131:sreg_32, %bb.1, %182, %bb.5
+
+  bb.4:
+    successors: %bb.6(0x40000000), %bb.5(0x40000000); %bb.5(50.00%), %bb.6(50.00%)
+
+    %154:sreg_32 = S_MOV_B32 0
+    %156:vgpr_32 = COPY %3:sreg_32
+    %162:vgpr_32 = COPY %3:sreg_32
+    %161:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %162:vgpr_32, %1:sgpr_128, %154:sreg_32, 0, 0, 0, implicit $exec
+    %164:vgpr_32 = COPY %3:sreg_32
+    %163:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %164:vgpr_32, %1:sgpr_128, %154:sreg_32, 2, 0, 0, implicit $exec
+    %9:vgpr_16 = COPY %161.lo16:vgpr_32
+    %10:vgpr_16 = COPY %163.lo16:vgpr_32
+    %165:sreg_32 = COPY %9:vgpr_16
+    %166:sreg_32 = COPY %10:vgpr_16
+    %12:sreg_32 = S_PACK_LL_B32_B16 %165:sreg_32, %166:sreg_32
+    %167:sgpr_lo16 = COPY %4.lo16:sreg_32
+    %170:vgpr_16 = COPY %167:sgpr_lo16
+    %177:vgpr_16 = V_SUB_NC_U16_t16_e64 0, %170:vgpr_16, 0, killed %9:vgpr_16, 0, 0, implicit $exec
+    %179:sreg_32 = COPY killed %177:vgpr_16
+    %180:sreg_32 = S_MOV_B32 255
+    %13:sreg_32 = S_AND_B32 killed %179:sreg_32, killed %180:sreg_32, implicit-def dead $scc
+    %181:sreg_32 = S_MOV_B32 12
+    S_CMP_LT_I32 %13:sreg_32, killed %181:sreg_32, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.6, implicit $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.6(0x40000000), %bb.3(0x40000000); %bb.6(50.00%), %bb.3(50.00%)
+
+    %183:sreg_32 = S_MOV_B32 -1
+    %182:sreg_32 = IMPLICIT_DEF
+    %184:sreg_32 = S_MOV_B32 18
+    S_CMP_LT_I32 %13:sreg_32, killed %184:sreg_32, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.3, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.6:
+      S_ENDPGM 0
+
+...

``````````



https://github.com/llvm/llvm-project/pull/118037

