[llvm] [AMDGPU] Don't optimize agpr phis if the operand doesn't have subreg use (PR #91267)

Mon May 6 13:44:01 PDT 2024

https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/91267

If the operand doesn't have any subreg use, the optimization could potentially
generate `V_ACCVGPR_READ_B32_e64` with the wrong register class, as in the
following case:

%46:vreg_128 = V_ACCVGPR_READ_B32_e64 %38:areg_128, implicit $exec


From 73f3e2a885175f951770edebb96adde5dcf4d984 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Mon, 6 May 2024 16:40:22 -0400
Subject: [PATCH] [AMDGPU] Don't optimize agpr phis if the operand doesn't
 have subreg use

If the operand doesn't have any subreg use, the optimization could potentially
generate `V_ACCVGPR_READ_B32_e64` with the wrong register class, as in the
following case:

%46:vreg_128 = V_ACCVGPR_READ_B32_e64 %38:areg_128, implicit $exec
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  2 +
 ...p-optimize-agpr-phi-without-subreg-use.mir | 58 +++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/skip-optimize-agpr-phi-without-subreg-use.mir

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cb448aaafa4c08..5c411a0955878f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2106,6 +2106,8 @@ bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 
     for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
       MachineOperand &PhiMO = MI.getOperand(K);
+      if (!PhiMO.getSubReg())
+        continue;
       RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/skip-optimize-agpr-phi-without-subreg-use.mir b/llvm/test/CodeGen/AMDGPU/skip-optimize-agpr-phi-without-subreg-use.mir
new file mode 100644
index 00000000000000..5e2d0426ecf719
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/skip-optimize-agpr-phi-without-subreg-use.mir
@@ -0,0 +1,58 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck %s
+
+# CHECK-NOT: V_ACCVGPR_READ_B32_e64
+
+---
+name:            skip_optimize_agpr_phi_without_subreg_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+body:             |
+  bb.0:
+    successors: %bb.1(0x80000000); %bb.1(100.00%)
+
+    %10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %11:sgpr_32 = S_MOV_B32 0
+    %12:sgpr_128 = REG_SEQUENCE %11:sgpr_32, %subreg.sub0, %11:sgpr_32, %subreg.sub1, %11:sgpr_32, %subreg.sub2, %11:sgpr_32, %subreg.sub3
+    %0:vreg_128 = COPY %12:sgpr_128
+    %9:sreg_64 = S_MOV_B64 0
+    %38:areg_128 = COPY %0:vreg_128, implicit $exec
+    %27:sreg_32 = S_MOV_B32 1
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)
+
+    %2:sreg_64 = PHI %9:sreg_64, %bb.0, %7:sreg_64, %bb.1
+    %3:areg_128 = PHI %38:areg_128, %bb.0, %39:areg_128, %bb.1
+    %4:areg_128 = PHI %38:areg_128, %bb.0, %41:areg_128, %bb.1
+    %14:areg_128 = V_MFMA_F32_16X16X4F32_e64 %10:vgpr_32, %10:vgpr_32, %4:areg_128, 0, 0, 0, implicit $mode, implicit $exec
+    %16:vgpr_32 = COPY %14.sub3:areg_128
+    %17:vgpr_32 = COPY %14.sub2:areg_128
+    %18:vgpr_32 = COPY %14.sub1:areg_128
+    %19:vgpr_32 = COPY %14.sub0:areg_128
+    %20:areg_128 = V_MFMA_F32_16X16X4F32_e64 %10:vgpr_32, %10:vgpr_32, %3:areg_128, 0, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = COPY %20.sub3:areg_128
+    %23:vgpr_32 = COPY %20.sub2:areg_128
+    %24:vgpr_32 = COPY %20.sub1:areg_128
+    %25:vgpr_32 = COPY %20.sub0:areg_128
+    %26:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    %28:sreg_64 = V_CMP_LT_I32_e64 killed %26:vgpr_32, %27:sreg_32, implicit $exec
+    %7:sreg_64 = SI_IF_BREAK killed %28:sreg_64, %2:sreg_64, implicit-def dead $scc
+    %39:areg_128 = COPY %20:areg_128, implicit $exec
+    %41:areg_128 = COPY %14:areg_128, implicit $exec
+    SI_LOOP %7:sreg_64, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    SI_END_CF %7:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %29:sreg_64 = S_MOV_B64 $src_private_base
+    %30:sreg_32 = COPY %29.sub1:sreg_64
+    %35:sgpr_32 = S_MOV_B32 0
+    %36:vgpr_32 = COPY killed %35:sgpr_32
+    %37:vgpr_32 = COPY killed %30:sreg_32
+    %34:vreg_64 = REG_SEQUENCE killed %36:vgpr_32, %subreg.sub0, killed %37:vgpr_32, %subreg.sub1
+    %33:vreg_64 = V_MOV_B64_PSEUDO 24, implicit $exec
+    FLAT_STORE_DWORDX2 killed %33:vreg_64, killed %34:vreg_64, 0, 0, implicit $exec, implicit $flat_scr
+    SI_RETURN