[llvm] 1e377a2 - AMDGPU: Don't consider global pressure when bundling soft clauses

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 2 16:00:22 PST 2021


Author: Matt Arsenault
Date: 2021-02-02T19:00:14-05:00
New Revision: 1e377a273f59375d8e6a424f66f069b3adfa1ca4

URL: https://github.com/llvm/llvm-project/commit/1e377a273f59375d8e6a424f66f069b3adfa1ca4
DIFF: https://github.com/llvm/llvm-project/commit/1e377a273f59375d8e6a424f66f069b3adfa1ca4.diff

LOG: AMDGPU: Don't consider global pressure when bundling soft clauses

This should only consider whether the pressure impact of the bundle at
the given point in the program will decrease the occupancy. High VGPR
pressure was incorrectly blocking the formation of scalar bundles, and
vice versa. High pressure at other points in the program was also
incorrectly blocking bundling.

Added: 
    llvm/test/CodeGen/AMDGPU/soft-clause-bundle-local-pressure.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNRegPressure.h
    llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ba8c85aa502b..d07da90aa944 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -127,6 +127,21 @@ class GCNRPTracker {
     return Res;
   }
 
+  /// Return the VGPR pressure at the current point
+  unsigned getCurrentNumVGPR() const {
+    return CurPressure.getVGPRNum();
+  }
+
+  /// Return the SGPR pressure at the current point
+  unsigned getCurrentNumSGPR() const {
+    return CurPressure.getSGPRNum();
+  }
+
+  /// Return the occupancy at the current point
+  unsigned getCurrentOccupancy(const GCNSubtarget &ST) const {
+    return CurPressure.getOccupancy(ST);
+  }
+
   decltype(LiveRegs) moveLiveRegs() {
     return std::move(LiveRegs);
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 59ce080c8e70..acaa3b367415 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -64,10 +64,11 @@ class SIFormMemoryClauses : public MachineFunctionPass {
 
   bool canBundle(const MachineInstr &MI, const RegUse &Defs,
                  const RegUse &Uses) const;
-  bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
+  bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT,
+                     bool IsVMEMClause);
   void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
   bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
-                      GCNDownwardRPTracker &RPT);
+                      GCNDownwardRPTracker &RPT, bool IsVMEMClause);
 
   const GCNSubtarget *ST;
   const SIRegisterInfo *TRI;
@@ -243,21 +244,33 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs,
 // Function returns false if pressure would hit the limit if instruction is
 // bundled into a memory clause.
 bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
-                                        GCNDownwardRPTracker &RPT) {
+                                        GCNDownwardRPTracker &RPT,
+                                        bool IsVMEMClause) {
+  unsigned OldOccupancy = RPT.getCurrentOccupancy(*ST);
+
   // NB: skip advanceBeforeNext() call. Since all defs will be marked
   // early-clobber they will all stay alive at least to the end of the
-  // clause. Therefor we should not decrease pressure even if load
-  // pointer becomes dead and could otherwise be reused for destination.
+  // clause. Therefore we should not decrease pressure even if load pointer
+  // becomes dead and could otherwise be reused for destination.
   RPT.advanceToNext();
-  GCNRegPressure MaxPressure = RPT.moveMaxPressure();
-  unsigned Occupancy = MaxPressure.getOccupancy(*ST);
-  if (Occupancy >= MFI->getMinAllowedOccupancy() &&
-      MaxPressure.getVGPRNum() <= MaxVGPRs &&
-      MaxPressure.getSGPRNum() <= MaxSGPRs) {
-    LastRecordedOccupancy = Occupancy;
-    return true;
+
+  unsigned NewOccupancy = RPT.getCurrentOccupancy(*ST);
+  if (NewOccupancy < OldOccupancy &&
+      NewOccupancy < MFI->getMinAllowedOccupancy())
+    return false;
+
+  // Scalar loads/clauses cannot def VGPRs, and vector loads/clauses cannot def
+  // SGPRs (ignoring artificial implicit operands).
+  if (IsVMEMClause) {
+    if (RPT.getCurrentNumVGPR() > MaxVGPRs)
+      return false;
+  } else {
+    if (RPT.getCurrentNumSGPR() > MaxSGPRs)
+      return false;
   }
-  return false;
+
+  LastRecordedOccupancy = NewOccupancy;
+  return true;
 }
 
 // Collect register defs and uses along with their lane masks and states.
@@ -289,13 +302,14 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
 // Check register def/use conflicts, occupancy limits and collect def/use maps.
 // Return true if instruction can be bundled with previous. If it cannot,
 // def/use maps are not updated.
-bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
-                                         RegUse &Defs, RegUse &Uses,
-                                         GCNDownwardRPTracker &RPT) {
+bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI, RegUse &Defs,
+                                         RegUse &Uses,
+                                         GCNDownwardRPTracker &RPT,
+                                         bool IsVMEMClause) {
   if (!canBundle(MI, Defs, Uses))
     return false;
 
-  if (!checkPressure(MI, RPT))
+  if (!checkPressure(MI, RPT, IsVMEMClause))
     return false;
 
   collectRegUses(MI, Defs, Uses);
@@ -349,7 +363,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
 
       const GCNRPTracker::LiveRegSet LiveRegsCopy(RPT.getLiveRegs());
       RegUse Defs, Uses;
-      if (!processRegUses(MI, Defs, Uses, RPT)) {
+      if (!processRegUses(MI, Defs, Uses, RPT, IsVMEM)) {
         RPT.reset(MI, &LiveRegsCopy);
         continue;
       }
@@ -367,7 +381,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
         // A load from pointer which was loaded inside the same bundle is an
         // impossible clause because we will need to write and read the same
         // register inside. In this case processRegUses will return false.
-        if (!processRegUses(*Next, Defs, Uses, RPT))
+        if (!processRegUses(*Next, Defs, Uses, RPT, IsVMEM))
           break;
 
         ++Length;

diff  --git a/llvm/test/CodeGen/AMDGPU/soft-clause-bundle-local-pressure.mir b/llvm/test/CodeGen/AMDGPU/soft-clause-bundle-local-pressure.mir
new file mode 100644
index 000000000000..a9409dddcdd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-bundle-local-pressure.mir
@@ -0,0 +1,271 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -run-pass=si-form-memory-clauses -verify-machineinstrs -o - %s | FileCheck %s
+
+# The VGPR pressure at the point the scalar clause would form decreases the
+# occupancy. However, this occupancy decrease is not due to the introduction of
+# the bundle so the bundle should still be inserted.
+
+---
+name: form_sgpr_clause_high_vgpr_pressure
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  waveLimiter:     false
+  memoryBound: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       10
+
+body:             |
+  bb.0:
+    liveins: $sgpr4_sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
+    ; CHECK-LABEL: name: form_sgpr_clause_high_vgpr_pressure
+    ; CHECK: early-clobber %26:sreg_32_xm0_xexec, early-clobber %27:sreg_32_xm0_xexec = BUNDLE [[REG:%[0-9]+]] {
+    ; CHECK:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG]], 0, 0, 0 :: (load 4, addrspace 4)
+    ; CHECK:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG]], 8, 0, 0 :: (load 4, addrspace 4)
+    ; CHECK: }
+    %0:sreg_64 = COPY $sgpr4_sgpr5
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = COPY $vgpr1
+    %3:vgpr_32 = COPY $vgpr2
+    %4:vgpr_32 = COPY $vgpr3
+    %5:vgpr_32 = COPY $vgpr4
+    %6:vgpr_32 = COPY $vgpr5
+    %7:vgpr_32 = COPY $vgpr6
+    %8:vgpr_32 = COPY $vgpr7
+    %9:vgpr_32 = COPY $vgpr8
+    %10:vgpr_32 = COPY $vgpr9
+    %11:vgpr_32 = COPY $vgpr10
+    %12:vgpr_32 = COPY $vgpr11
+    %13:vgpr_32 = COPY $vgpr12
+    %14:vgpr_32 = COPY $vgpr13
+    %15:vgpr_32 = COPY $vgpr14
+    %16:vgpr_32 = COPY $vgpr15
+    %17:vgpr_32 = COPY $vgpr16
+    %18:vgpr_32 = COPY $vgpr17
+    %19:vgpr_32 = COPY $vgpr18
+    %20:vgpr_32 = COPY $vgpr19
+    %21:vgpr_32 = COPY $vgpr20
+    %22:vgpr_32 = COPY $vgpr21
+    %23:vgpr_32 = COPY $vgpr22
+    %24:vgpr_32 = COPY $vgpr23
+    %27:vgpr_32 = COPY $vgpr24
+    %25:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0, 0 :: (load 4, align 4, addrspace 4)
+    %26:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 8, 0, 0 :: (load 4, align 4, addrspace 4)
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %2
+    S_NOP 0, implicit %3
+    S_NOP 0, implicit %4
+    S_NOP 0, implicit %5
+    S_NOP 0, implicit %6
+    S_NOP 0, implicit %7
+    S_NOP 0, implicit %8
+    S_NOP 0, implicit %9
+    S_NOP 0, implicit %10
+    S_NOP 0, implicit %11
+    S_NOP 0, implicit %12
+    S_NOP 0, implicit %13
+    S_NOP 0, implicit %14
+    S_NOP 0, implicit %15
+    S_NOP 0, implicit %16
+    S_NOP 0, implicit %17
+    S_NOP 0, implicit %18
+    S_NOP 0, implicit %19
+    S_NOP 0, implicit %20
+    S_NOP 0, implicit %21
+    S_NOP 0, implicit %22
+    S_NOP 0, implicit %23
+    S_NOP 0, implicit %24
+    S_NOP 0, implicit %27
+    S_ENDPGM 0, implicit %25, implicit %26
+
+...
+
+# The SGPR pressure at the point the vector clause would form decreases the
+# occupancy. However, this occupancy decrease is not due to the introduction of
+# the bundle so the bundle should still be inserted.
+---
+name: form_vgpr_clause_high_sgpr_pressure
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  waveLimiter:     false
+  memoryBound: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       10
+
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr32, $sgpr33, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80
+    ; CHECK-LABEL: name: form_vgpr_clause_high_sgpr_pressure
+    ; CHECK: early-clobber %82:vgpr_32, early-clobber %83:vgpr_32 = BUNDLE [[REG:%[0-9]+]], implicit $exec {
+    ; CHECK:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG]], 8, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; CHECK: }
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:sgpr_32 = COPY $sgpr0
+    %2:sgpr_32 = COPY $sgpr1
+    %3:sgpr_32 = COPY $sgpr2
+    %4:sgpr_32 = COPY $sgpr3
+    %5:sgpr_32 = COPY $sgpr4
+    %6:sgpr_32 = COPY $sgpr5
+    %7:sgpr_32 = COPY $sgpr6
+    %8:sgpr_32 = COPY $sgpr7
+    %9:sgpr_32 = COPY $sgpr8
+    %10:sgpr_32 = COPY $sgpr9
+    %11:sgpr_32 = COPY $sgpr10
+    %12:sgpr_32 = COPY $sgpr11
+    %13:sgpr_32 = COPY $sgpr12
+    %14:sgpr_32 = COPY $sgpr13
+    %15:sgpr_32 = COPY $sgpr14
+    %16:sgpr_32 = COPY $sgpr15
+    %17:sgpr_32 = COPY $sgpr16
+    %18:sgpr_32 = COPY $sgpr17
+    %19:sgpr_32 = COPY $sgpr18
+    %20:sgpr_32 = COPY $sgpr19
+    %21:sgpr_32 = COPY $sgpr20
+    %22:sgpr_32 = COPY $sgpr21
+    %23:sgpr_32 = COPY $sgpr22
+    %24:sgpr_32 = COPY $sgpr23
+    %25:sgpr_32 = COPY $sgpr24
+    %26:sgpr_32 = COPY $sgpr25
+    %27:sgpr_32 = COPY $sgpr26
+    %28:sgpr_32 = COPY $sgpr27
+    %29:sgpr_32 = COPY $sgpr28
+    %30:sgpr_32 = COPY $sgpr29
+    %31:sgpr_32 = COPY $sgpr30
+    %32:sgpr_32 = COPY $sgpr31
+    %33:sgpr_32 = COPY $sgpr32
+    %34:sgpr_32 = COPY $sgpr33
+    %35:sgpr_32 = COPY $sgpr34
+    %36:sgpr_32 = COPY $sgpr35
+    %37:sgpr_32 = COPY $sgpr36
+    %38:sgpr_32 = COPY $sgpr37
+    %39:sgpr_32 = COPY $sgpr38
+    %40:sgpr_32 = COPY $sgpr39
+    %41:sgpr_32 = COPY $sgpr40
+    %42:sgpr_32 = COPY $sgpr41
+    %43:sgpr_32 = COPY $sgpr42
+    %44:sgpr_32 = COPY $sgpr43
+    %45:sgpr_32 = COPY $sgpr44
+    %46:sgpr_32 = COPY $sgpr45
+    %47:sgpr_32 = COPY $sgpr46
+    %48:sgpr_32 = COPY $sgpr47
+    %49:sgpr_32 = COPY $sgpr48
+    %50:sgpr_32 = COPY $sgpr49
+    %51:sgpr_32 = COPY $sgpr50
+    %52:sgpr_32 = COPY $sgpr51
+    %53:sgpr_32 = COPY $sgpr52
+    %54:sgpr_32 = COPY $sgpr53
+    %55:sgpr_32 = COPY $sgpr54
+    %56:sgpr_32 = COPY $sgpr55
+    %57:sgpr_32 = COPY $sgpr56
+    %58:sgpr_32 = COPY $sgpr57
+    %59:sgpr_32 = COPY $sgpr58
+    %60:sgpr_32 = COPY $sgpr59
+    %61:sgpr_32 = COPY $sgpr60
+    %62:sgpr_32 = COPY $sgpr61
+    %63:sgpr_32 = COPY $sgpr62
+    %64:sgpr_32 = COPY $sgpr63
+    %65:sgpr_32 = COPY $sgpr64
+    %66:sgpr_32 = COPY $sgpr65
+    %67:sgpr_32 = COPY $sgpr66
+    %68:sgpr_32 = COPY $sgpr67
+    %69:sgpr_32 = COPY $sgpr68
+    %70:sgpr_32 = COPY $sgpr69
+    %71:sgpr_32 = COPY $sgpr70
+    %72:sgpr_32 = COPY $sgpr71
+    %73:sgpr_32 = COPY $sgpr72
+    %74:sgpr_32 = COPY $sgpr73
+    %75:sgpr_32 = COPY $sgpr74
+    %76:sgpr_32 = COPY $sgpr75
+    %77:sgpr_32 = COPY $sgpr76
+    %78:sgpr_32 = COPY $sgpr77
+    %79:sgpr_32 = COPY $sgpr78
+    %80:sgpr_32 = COPY $sgpr79
+    %81:sgpr_32 = COPY $sgpr80
+    %82:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4, align 4, addrspace 1)
+    %83:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec :: (load 4, align 4, addrspace 1)
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %2
+    S_NOP 0, implicit %3
+    S_NOP 0, implicit %4
+    S_NOP 0, implicit %5
+    S_NOP 0, implicit %6
+    S_NOP 0, implicit %7
+    S_NOP 0, implicit %8
+    S_NOP 0, implicit %9
+    S_NOP 0, implicit %10
+    S_NOP 0, implicit %11
+    S_NOP 0, implicit %12
+    S_NOP 0, implicit %13
+    S_NOP 0, implicit %14
+    S_NOP 0, implicit %15
+    S_NOP 0, implicit %16
+    S_NOP 0, implicit %17
+    S_NOP 0, implicit %18
+    S_NOP 0, implicit %19
+    S_NOP 0, implicit %20
+    S_NOP 0, implicit %21
+    S_NOP 0, implicit %22
+    S_NOP 0, implicit %23
+    S_NOP 0, implicit %24
+    S_NOP 0, implicit %25
+    S_NOP 0, implicit %26
+    S_NOP 0, implicit %27
+    S_NOP 0, implicit %28
+    S_NOP 0, implicit %29
+    S_NOP 0, implicit %30
+    S_NOP 0, implicit %31
+    S_NOP 0, implicit %32
+    S_NOP 0, implicit %33
+    S_NOP 0, implicit %34
+    S_NOP 0, implicit %35
+    S_NOP 0, implicit %36
+    S_NOP 0, implicit %37
+    S_NOP 0, implicit %38
+    S_NOP 0, implicit %39
+    S_NOP 0, implicit %40
+    S_NOP 0, implicit %41
+    S_NOP 0, implicit %42
+    S_NOP 0, implicit %43
+    S_NOP 0, implicit %44
+    S_NOP 0, implicit %45
+    S_NOP 0, implicit %46
+    S_NOP 0, implicit %47
+    S_NOP 0, implicit %48
+    S_NOP 0, implicit %49
+    S_NOP 0, implicit %50
+    S_NOP 0, implicit %51
+    S_NOP 0, implicit %52
+    S_NOP 0, implicit %53
+    S_NOP 0, implicit %54
+    S_NOP 0, implicit %55
+    S_NOP 0, implicit %56
+    S_NOP 0, implicit %57
+    S_NOP 0, implicit %58
+    S_NOP 0, implicit %59
+    S_NOP 0, implicit %60
+    S_NOP 0, implicit %61
+    S_NOP 0, implicit %62
+    S_NOP 0, implicit %63
+    S_NOP 0, implicit %64
+    S_NOP 0, implicit %65
+    S_NOP 0, implicit %66
+    S_NOP 0, implicit %67
+    S_NOP 0, implicit %68
+    S_NOP 0, implicit %69
+    S_NOP 0, implicit %70
+    S_NOP 0, implicit %71
+    S_NOP 0, implicit %72
+    S_NOP 0, implicit %73
+    S_NOP 0, implicit %74
+    S_NOP 0, implicit %75
+    S_NOP 0, implicit %76
+    S_NOP 0, implicit %77
+    S_NOP 0, implicit %78
+    S_NOP 0, implicit %79
+    S_NOP 0, implicit %80
+    S_NOP 0, implicit %81
+    S_ENDPGM 0, implicit %82, implicit %83
+...


        


More information about the llvm-commits mailing list