[llvm] [AArch64] Fix resource length computation for STP. (PR #81749)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 16 07:02:10 PST 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/81749

>From debf1491e9c9e64be6abd4fd7606fa90cd493c98 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 14 Feb 2024 15:33:15 +0000
Subject: [PATCH] [AArch64] Fix resource length computation for STP

On some uArchs, `STP [s|d], [s|d]` first combines the 2 input registers in a
single register using a vector execution unit. IIUC AArch64StorePairSuppress
tries to prevent forming STPs in case the critical resource are the vector
units, in order to prevent adding more pressure on those units.

The implementation however simply computes the new critical resource length by
adding resource for another STP. If load/store units are the critical resource,
this means we increase that length by one, and incorrectly prevent forming the
STP.

This patch adjusts the resource computation by also removing 2 STRs, as
introducing a STP will remove 2 single stores. This should more accurately
reflect the resource usage after introducing an STP, and does not prevent
forming STPs if load/store units are the critical resources; in those cases,
STP can actually help to reduce resource usage.
---
 .../AArch64/AArch64StorePairSuppress.cpp       | 18 +++++++++++++-----
 llvm/test/CodeGen/AArch64/arm64-stur.ll        |  5 ++---
 llvm/test/CodeGen/AArch64/merge-store.ll       |  3 +--
 .../AArch64/storepairsuppress_minsize.ll       |  9 +++------
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 7324be48a415ad..773c309a0943e3 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -81,15 +81,23 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB)
   MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
   unsigned ResLength = BBTrace.getResourceLength();
 
-  // Get the machine model's scheduling class for STPQi.
+  // Get the machine model's scheduling class for STPDi and STRDui.
   // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
   unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass();
-  const MCSchedClassDesc *SCDesc =
+  const MCSchedClassDesc *PairSCDesc =
       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
 
-  // If a subtarget does not define resources for STPQi, bail here.
-  if (SCDesc->isValid() && !SCDesc->isVariant()) {
-    unsigned ResLenWithSTP = BBTrace.getResourceLength(std::nullopt, SCDesc);
+  unsigned SCIdx2 = TII->get(AArch64::STRDui).getSchedClass();
+  const MCSchedClassDesc *SingleSCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx2);
+
+  // If a subtarget does not define resources for STPDi, bail here.
+  if (PairSCDesc->isValid() && !PairSCDesc->isVariant() &&
+      SingleSCDesc->isValid() && !SingleSCDesc->isVariant()) {
+    // Compute the new critical resource length after replacing 2 separate
+    // STRDui with one STPDi.
+    unsigned ResLenWithSTP = BBTrace.getResourceLength(
+        std::nullopt, PairSCDesc, {SingleSCDesc, SingleSCDesc});
     if (ResLenWithSTP > ResLength) {
       LLVM_DEBUG(dbgs() << "  Suppress STP in BB: " << BB->getNumber()
                         << " resources " << ResLength << " -> " << ResLenWithSTP
diff --git a/llvm/test/CodeGen/AArch64/arm64-stur.ll b/llvm/test/CodeGen/AArch64/arm64-stur.ll
index 2a74abb10226da..7d9de9e28ff5c0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-stur.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-stur.ll
@@ -65,9 +65,8 @@ declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind
 
 ; CHECK-LABEL: unaligned:
 ; CHECK-NOT: str q0
-; CHECK: str     d[[REG:[0-9]+]], [x0]
-; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG]], v[[REG]], #8
-; CHECK: str     d[[REG2]], [x0, #8]
+; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG:[0-9]+]], v[[REG]], #8
+; CHECK: stp     d[[REG]], d[[REG2]], [x0]
 define void @unaligned(ptr %p, <4 x i32> %v) nounwind {
   store <4 x i32> %v, ptr %p, align 4
   ret void
diff --git a/llvm/test/CodeGen/AArch64/merge-store.ll b/llvm/test/CodeGen/AArch64/merge-store.ll
index b93d0c3bc96086..6653984562ae6d 100644
--- a/llvm/test/CodeGen/AArch64/merge-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-store.ll
@@ -45,8 +45,7 @@ define void @merge_vec_extract_stores(<4 x float> %v1, ptr %ptr) {
 ; SPLITTING-LABEL: merge_vec_extract_stores:
 ; SPLITTING:       // %bb.0:
 ; SPLITTING-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; SPLITTING-NEXT:    str d0, [x0, #24]
-; SPLITTING-NEXT:    str d1, [x0, #32]
+; SPLITTING-NEXT:    stp d0, d1, [x0, #24]
 ; SPLITTING-NEXT:    ret
 ;
 ; MISALIGNED-LABEL: merge_vec_extract_stores:
diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
index 6d1986ebb8182b..9bad71089f8a31 100644
--- a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
+++ b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
@@ -16,12 +16,9 @@ define void @test_default() uwtable {
 ; CHECK-NEXT:    bl return_in_block
 ; CHECK-NEXT:    adrp x8, in_block_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_block_store
-; CHECK-NEXT:    str d0, [x8]
-; CHECK-NEXT:    str d1, [x8, #8]
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    str d3, [x8, #24]
-; CHECK-NEXT:    str d4, [x8, #32]
-; CHECK-NEXT:    str d5, [x8, #40]
+; CHECK-NEXT:    stp d0, d1, [x8]
+; CHECK-NEXT:    stp d2, d3, [x8, #16]
+; CHECK-NEXT:    stp d4, d5, [x8, #32]
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30



More information about the llvm-commits mailing list