[llvm] [AArch64] Fix resource length computation for STP. (PR #81749)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 14 07:55:01 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

On some uArchs, `STP [s|d], [s|d]` first combines the 2 input registers in a single register using a vector execution unit. IIUC AArch64StorePairSuppress tries to prevent forming STPs in case the critical resource are the vector units, in order to prevent adding more pressure on those units.

The implementation however simply computes the new critical resource length by adding resource for another STP. If load/store units are the critical resource, this means we increase that length by one, and incorrectly prevent forming the STP.

This patch adjusts the resource computation by also removing 2 STRs, as introducing a STP will remove 2 single stores. This should more accurately reflect the resource usage after introducing an STP, and does not prevent forming STPs if load/store units are the critical resources; in those cases, STP can actually help to reduce resource usage.

---
Full diff: https://github.com/llvm/llvm-project/pull/81749.diff


4 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp (+13-5) 
- (modified) llvm/test/CodeGen/AArch64/arm64-stur.ll (+2-3) 
- (modified) llvm/test/CodeGen/AArch64/merge-store.ll (+1-2) 
- (modified) llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll (+3-6) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 7324be48a415ad..773c309a0943e3 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -81,15 +81,23 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB)
   MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
   unsigned ResLength = BBTrace.getResourceLength();
 
-  // Get the machine model's scheduling class for STPQi.
+  // Get the machine model's scheduling class for STPDi and STRDui.
   // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
   unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass();
-  const MCSchedClassDesc *SCDesc =
+  const MCSchedClassDesc *PairSCDesc =
       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
 
-  // If a subtarget does not define resources for STPQi, bail here.
-  if (SCDesc->isValid() && !SCDesc->isVariant()) {
-    unsigned ResLenWithSTP = BBTrace.getResourceLength(std::nullopt, SCDesc);
+  unsigned SCIdx2 = TII->get(AArch64::STRDui).getSchedClass();
+  const MCSchedClassDesc *SingleSCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx2);
+
+  // If a subtarget does not define resources for STPDi, bail here.
+  if (PairSCDesc->isValid() && !PairSCDesc->isVariant() &&
+      SingleSCDesc->isValid() && !SingleSCDesc->isVariant()) {
+    // Compute the new critical resource length after replacing 2 separate
+    // STRDui with one STPDi.
+    unsigned ResLenWithSTP = BBTrace.getResourceLength(
+        std::nullopt, PairSCDesc, {SingleSCDesc, SingleSCDesc});
     if (ResLenWithSTP > ResLength) {
       LLVM_DEBUG(dbgs() << "  Suppress STP in BB: " << BB->getNumber()
                         << " resources " << ResLength << " -> " << ResLenWithSTP
diff --git a/llvm/test/CodeGen/AArch64/arm64-stur.ll b/llvm/test/CodeGen/AArch64/arm64-stur.ll
index 2a74abb10226da..7d9de9e28ff5c0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-stur.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-stur.ll
@@ -65,9 +65,8 @@ declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind
 
 ; CHECK-LABEL: unaligned:
 ; CHECK-NOT: str q0
-; CHECK: str     d[[REG:[0-9]+]], [x0]
-; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG]], v[[REG]], #8
-; CHECK: str     d[[REG2]], [x0, #8]
+; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG:[0-9]+]], v[[REG]], #8
+; CHECK: stp     d[[REG]], d[[REG2]], [x0]
 define void @unaligned(ptr %p, <4 x i32> %v) nounwind {
   store <4 x i32> %v, ptr %p, align 4
   ret void
diff --git a/llvm/test/CodeGen/AArch64/merge-store.ll b/llvm/test/CodeGen/AArch64/merge-store.ll
index b93d0c3bc96086..6653984562ae6d 100644
--- a/llvm/test/CodeGen/AArch64/merge-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-store.ll
@@ -45,8 +45,7 @@ define void @merge_vec_extract_stores(<4 x float> %v1, ptr %ptr) {
 ; SPLITTING-LABEL: merge_vec_extract_stores:
 ; SPLITTING:       // %bb.0:
 ; SPLITTING-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; SPLITTING-NEXT:    str d0, [x0, #24]
-; SPLITTING-NEXT:    str d1, [x0, #32]
+; SPLITTING-NEXT:    stp d0, d1, [x0, #24]
 ; SPLITTING-NEXT:    ret
 ;
 ; MISALIGNED-LABEL: merge_vec_extract_stores:
diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
index 6d1986ebb8182b..9bad71089f8a31 100644
--- a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
+++ b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
@@ -16,12 +16,9 @@ define void @test_default() uwtable {
 ; CHECK-NEXT:    bl return_in_block
 ; CHECK-NEXT:    adrp x8, in_block_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_block_store
-; CHECK-NEXT:    str d0, [x8]
-; CHECK-NEXT:    str d1, [x8, #8]
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    str d3, [x8, #24]
-; CHECK-NEXT:    str d4, [x8, #32]
-; CHECK-NEXT:    str d5, [x8, #40]
+; CHECK-NEXT:    stp d0, d1, [x8]
+; CHECK-NEXT:    stp d2, d3, [x8, #16]
+; CHECK-NEXT:    stp d4, d5, [x8, #32]
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30

``````````

</details>


https://github.com/llvm/llvm-project/pull/81749


More information about the llvm-commits mailing list