[llvm] [MachineLICM] Hoist COPY instruction only when user can be hoisted (PR #81735)

Tue Feb 27 02:21:32 PST 2024

https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/81735

>From 68f81adb3f3a2c91872e1ee2ba5324684e99e975 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 26 Jan 2024 16:58:34 +0000
Subject: [PATCH 1/5] [MachineLICM] Hoist COPY instruction only when user can
 be hoisted

befa925acac8fd6a9266e introduced preliminary hoisting of COPY
instructions when the user of the COPY is inside the same loop.
That optimization appeared to be too aggressive and hoisted too many
COPY's greately increasing register pressure causing performance
regressions for AMDGPU target.

This is intended to fix the regression by hoisting COPY instruction only
if either:
 - User of COPY can be hoisted (other args are invariant)
or
 - Hoisting COPY doesn't bring high register pressure
---
 llvm/include/llvm/CodeGen/MachineLoopInfo.h |  5 ++++-
 llvm/lib/CodeGen/MachineLICM.cpp            | 19 +++++++++++++++----
 llvm/lib/CodeGen/MachineLoopInfo.cpp        |  6 +++++-
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index ae075bee1daf76..445c9b1c3bc00f 100644
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -79,7 +79,10 @@ class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
   /// I.e., all virtual register operands are defined outside of the loop,
   /// physical registers aren't accessed explicitly, and there are no side
   /// effects that aren't captured by the operands or other flags.
-  bool isLoopInvariant(MachineInstr &I) const;
+  /// ExcludeReg can be used to exclude the given register from the check
+  /// i.e. when we're considering hoisting it's definition but not hoisted it
+  /// yet
+  bool isLoopInvariant(MachineInstr &I, const Register ExcludeReg = 0) const;
 
   void dump() const;
 
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index efc19f8fdbf8c2..a3bfd97e5a90cb 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,13 +1264,24 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
 
   // If we have a COPY with other uses in the loop, hoist to allow the users to
   // also be hoisted.
+  Register defReg;
   if (MI.isCopy() && MI.getOperand(0).isReg() &&
-      MI.getOperand(0).getReg().isVirtual() && MI.getOperand(1).isReg() &&
-      MI.getOperand(1).getReg().isVirtual() &&
+      (defReg = MI.getOperand(0).getReg()).isVirtual() &&
+      MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
       IsLoopInvariantInst(MI, CurLoop) &&
       any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
-             [&CurLoop](MachineInstr &UseMI) {
-               return CurLoop->contains(&UseMI);
+             [&CurLoop, this, defReg, Cost](MachineInstr &UseMI) {
+               if (!CurLoop->contains(&UseMI))
+                 return false;
+
+               // COPY is a cheap instruction, but if moving it won't cause high
+               // RP we're fine to hoist it even if the user can't be hoisted
+               // later Otherwise we want to check the user if it's hoistable
+               if (CanCauseHighRegPressure(Cost, false) &&
+                   !CurLoop->isLoopInvariant(UseMI, defReg))
+                 return false;
+
+               return true;
              }))
     return true;
 
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index bdbc57099aa8d3..1492c8c366fb41 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -198,7 +198,8 @@ MDNode *MachineLoop::getLoopID() const {
   return LoopID;
 }
 
-bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
+bool MachineLoop::isLoopInvariant(MachineInstr &I,
+                                  const Register ExcludeReg) const {
   MachineFunction *MF = I.getParent()->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
   const TargetSubtargetInfo &ST = MF->getSubtarget();
@@ -213,6 +214,9 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
     Register Reg = MO.getReg();
     if (Reg == 0) continue;
 
+    if (ExcludeReg == Reg)
+      continue;
+
     // An instruction that uses or defines a physical register can't e.g. be
     // hoisted, so mark this as not invariant.
     if (Reg.isPhysical()) {

>From 10dbe5e99055349829af8f2a4c8d4efec637cb1c Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 23 Feb 2024 14:22:35 +0100
Subject: [PATCH 2/5] Added test

---
 .../CodeGen/AMDGPU/copy-hoist-no-spills.ll    | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll

diff --git a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
new file mode 100644
index 00000000000000..55904f44082ec9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -vectorize-loops -vectorize-slp -amdgpu-early-inline-all=true -amdgpu-function-calls=false < %s | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32"
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @_ZN6Kokkos4ImplL32hip_parallel_launch_local_memoryINS0_11ParallelForIN4Test7HexGradINS_3HIPEdfEENS_11RangePolicyIJS5_EEES5_EELj1024ELj1EEEvT_(ptr %.sroa.1.0.copyload, ptr %0, ptr %1, ptr %2, ptr %3, ptr %4, ptr %5, ptr %6, ptr %7, ptr %8, ptr %9, ptr %10, ptr %11, ptr %12, ptr %13, ptr %14, ptr %15, ptr %16, ptr %17, ptr %18, ptr %19, ptr %20, ptr %21, ptr %22, ptr %23, ptr %24, ptr %25, ptr %26, ptr %27, ptr %28, ptr %29, ptr %30, ptr %31, ptr %32, ptr %33, double %34, double %35, double %36, float %37, float %38, float %39, float %40, ptr %41) {
+; CHECK-LABEL: _ZN6Kokkos4ImplL32hip_parallel_launch_local_memoryINS0_11ParallelForIN4Test7HexGradINS_3HIPEdfEENS_11RangePolicyIJS5_EEES5_EELj1024ELj1EEEvT_:
+; CHECK-LABEL: .LBB0_1
+; CHECK-NOT: buffer_load_dword {{v[0-9]+}}
+
+.lr.ph:
+  %.pre = load double, ptr null, align 8
+  br label %42
+
+42:                                               ; preds = %42, %.lr.ph
+  %.0.i4402 = phi i32 [ 1, %.lr.ph ], [ 0, %42 ]
+  %43 = zext i32 %.0.i4402 to i64
+  %44 = load double, ptr %2, align 8
+  %45 = load double, ptr %4, align 8
+  %46 = load double, ptr %7, align 8
+  %47 = load double, ptr %13, align 8
+  %48 = load double, ptr %15, align 8
+  %49 = load double, ptr %17, align 8
+  %50 = load double, ptr %19, align 8
+  %51 = load double, ptr %18, align 8
+  %52 = load double, ptr %27, align 8
+  %53 = load double, ptr %23, align 8
+  %54 = load double, ptr %31, align 8
+  %55 = load double, ptr %33, align 8
+  %56 = load double, ptr %25, align 8
+  %57 = load double, ptr %16, align 8
+  %58 = fpext float %40 to double
+  %59 = fmul double %52, %58
+  %60 = fadd double %59, %51
+  %61 = fsub double %60, %48
+  %62 = fmul double 0.000000e+00, %36
+  %63 = fsub double %61, %62
+  %64 = fadd double %49, %63
+  %65 = fptrunc double %64 to float
+  %66 = fsub double 0.000000e+00, %34
+  %67 = fpext float %39 to double
+  %68 = fmul double %53, %67
+  %69 = fsub double %66, %68
+  %70 = fadd double %50, %69
+  %71 = fptrunc double %70 to float
+  store float 0.000000e+00, ptr %30, align 4
+  store float 0.000000e+00, ptr %26, align 4
+  %72 = getelementptr float, ptr %41, i64 %43
+  store float %38, ptr %72, align 4
+  store float %65, ptr %29, align 4
+  store float %71, ptr %14, align 4
+  store float %39, ptr %3, align 4
+  store float %39, ptr %11, align 4
+  %73 = fsub double %46, %44
+  %74 = fptrunc double %73 to float
+  %75 = fsub double %47, %45
+  %76 = fptrunc double %75 to float
+  %77 = fadd float %74, %76
+  %78 = fpext float %37 to double
+  %79 = fmul contract double %56, 0.000000e+00
+  %80 = fsub contract double %34, %79
+  %81 = fpext float %77 to double
+  %82 = fmul double %.pre, %81
+  %83 = fsub double %80, %82
+  %84 = fpext float %38 to double
+  %85 = fmul double %57, %84
+  %86 = fsub double %83, %85
+  %87 = fptrunc double %86 to float
+  %88 = fmul double %34, 0.000000e+00
+  %89 = fmul double %54, %78
+  %90 = fadd double %89, %88
+  %91 = fsub double %90, %55
+  %92 = fmul double 0.000000e+00, %35
+  %93 = fsub double %91, %92
+  %94 = fmul double %34, %34
+  %95 = fadd double %93, %94
+  %96 = fptrunc double %95 to float
+  store float %87, ptr %1, align 4
+  store float %37, ptr %21, align 4
+  store float %96, ptr %0, align 4
+  store float 0.000000e+00, ptr %9, align 4
+  store float 0.000000e+00, ptr %32, align 4
+  store float 0.000000e+00, ptr %20, align 4
+  store float 0.000000e+00, ptr %22, align 4
+  store float 0.000000e+00, ptr %5, align 4
+  store float 0.000000e+00, ptr %28, align 4
+  store float 0.000000e+00, ptr %12, align 4
+  store float 0.000000e+00, ptr %6, align 4
+  store float 0.000000e+00, ptr %8, align 4
+  store float 0.000000e+00, ptr %.sroa.1.0.copyload, align 4
+  store float %37, ptr %10, align 4
+  store float 0.000000e+00, ptr %24, align 4
+  br label %42
+}

>From 5e991929129a28572107561ce28d49576d0fd820 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 23 Feb 2024 14:39:40 +0100
Subject: [PATCH 3/5] Reduced function name in the test

---
 llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
index 55904f44082ec9..86d6ee7609d2cc 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
@@ -3,8 +3,8 @@
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32"
 target triple = "amdgcn-amd-amdhsa"
 
-define amdgpu_kernel void @_ZN6Kokkos4ImplL32hip_parallel_launch_local_memoryINS0_11ParallelForIN4Test7HexGradINS_3HIPEdfEENS_11RangePolicyIJS5_EEES5_EELj1024ELj1EEEvT_(ptr %.sroa.1.0.copyload, ptr %0, ptr %1, ptr %2, ptr %3, ptr %4, ptr %5, ptr %6, ptr %7, ptr %8, ptr %9, ptr %10, ptr %11, ptr %12, ptr %13, ptr %14, ptr %15, ptr %16, ptr %17, ptr %18, ptr %19, ptr %20, ptr %21, ptr %22, ptr %23, ptr %24, ptr %25, ptr %26, ptr %27, ptr %28, ptr %29, ptr %30, ptr %31, ptr %32, ptr %33, double %34, double %35, double %36, float %37, float %38, float %39, float %40, ptr %41) {
-; CHECK-LABEL: _ZN6Kokkos4ImplL32hip_parallel_launch_local_memoryINS0_11ParallelForIN4Test7HexGradINS_3HIPEdfEENS_11RangePolicyIJS5_EEES5_EELj1024ELj1EEEvT_:
+define amdgpu_kernel void @foo(ptr %.sroa.1.0.copyload, ptr %0, ptr %1, ptr %2, ptr %3, ptr %4, ptr %5, ptr %6, ptr %7, ptr %8, ptr %9, ptr %10, ptr %11, ptr %12, ptr %13, ptr %14, ptr %15, ptr %16, ptr %17, ptr %18, ptr %19, ptr %20, ptr %21, ptr %22, ptr %23, ptr %24, ptr %25, ptr %26, ptr %27, ptr %28, ptr %29, ptr %30, ptr %31, ptr %32, ptr %33, double %34, double %35, double %36, float %37, float %38, float %39, float %40, ptr %41) {
+; CHECK-LABEL: foo:
 ; CHECK-LABEL: .LBB0_1
 ; CHECK-NOT: buffer_load_dword {{v[0-9]+}}
 

>From 67ccbd1430cd7e4f94bf0212a634367d8e9936a4 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Mon, 26 Feb 2024 13:47:00 +0100
Subject: [PATCH 4/5] Addressed PR comments

---
 llvm/lib/CodeGen/MachineLICM.cpp                 | 8 ++++----
 llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index a3bfd97e5a90cb..997f6eb085129d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,13 +1264,13 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
 
   // If we have a COPY with other uses in the loop, hoist to allow the users to
   // also be hoisted.
-  Register defReg;
+  Register DefReg;
   if (MI.isCopy() && MI.getOperand(0).isReg() &&
-      (defReg = MI.getOperand(0).getReg()).isVirtual() &&
+      (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
       MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
       IsLoopInvariantInst(MI, CurLoop) &&
       any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
-             [&CurLoop, this, defReg, Cost](MachineInstr &UseMI) {
+             [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
                if (!CurLoop->contains(&UseMI))
                  return false;
 
@@ -1278,7 +1278,7 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
                // RP we're fine to hoist it even if the user can't be hoisted
                // later Otherwise we want to check the user if it's hoistable
                if (CanCauseHighRegPressure(Cost, false) &&
-                   !CurLoop->isLoopInvariant(UseMI, defReg))
+                   !CurLoop->isLoopInvariant(UseMI, DefReg))
                  return false;
 
                return true;
diff --git a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
index 86d6ee7609d2cc..4ffeb95721f3c7 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; NOTE: There must be no spill reload inside the loop starting with LBB0_1:
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -vectorize-loops -vectorize-slp -amdgpu-early-inline-all=true -amdgpu-function-calls=false < %s | FileCheck %s
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32"
 target triple = "amdgcn-amd-amdhsa"

>From 5790f2376eaa403223ca8b720cd55057c8b1894f Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Mon, 26 Feb 2024 15:26:32 +0100
Subject: [PATCH 5/5] Addressed PR comments

---
 llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
index 4ffeb95721f3c7..4ae122bb3546a7 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
@@ -1,5 +1,5 @@
 ; NOTE: There must be no spill reload inside the loop starting with LBB0_1:
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -vectorize-loops -vectorize-slp -amdgpu-early-inline-all=true -amdgpu-function-calls=false < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32"
 target triple = "amdgcn-amd-amdhsa"