[llvm] [AArch64] MI Scheduler LDP/STP combine follow up (PR #79003)

Sjoerd Meijer via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 22 08:16:03 PST 2024


https://github.com/sjoerdmeijer created https://github.com/llvm/llvm-project/pull/79003

This is a follow-up to 75d820dcdd86 that adds more opcodes to the combine target hook, enabling more LDP/STP creation.

Patch co-authored by Cameron McInally.

>From dc9c5c19e17ba71b31328eaa3faab494d3ec36c8 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer at nvidia.com>
Date: Tue, 16 Jan 2024 17:20:41 +0530
Subject: [PATCH] [AArch64] MI Scheduler LDP/STP combine follow up

This is a follow-up to 75d820dcdd86 that adds more opcodes to the combine
target hook, enabling more LDP/STP creation.

Patch co-authored by Cameron McInally.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  24 +++
 .../test/CodeGen/AArch64/arm64-ldp-cluster.ll | 145 +++++++++++++++++-
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      |  20 +--
 3 files changed, 178 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 42b7a6418032ad..fbdd5f85e7aae8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4214,6 +4214,27 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
   switch (FirstOpc) {
   default:
     return false;
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+    return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+    return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
   case AArch64::LDRQui:
   case AArch64::LDURQi:
     return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
@@ -4223,6 +4244,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
   case AArch64::LDRSWui:
   case AArch64::LDURSWi:
     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
   }
   // These instructions can't be paired based on their opcodes.
   return false;
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 83f86d1c3a7cbf..8c7b31fd34c488 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CHECK-A57
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 
 ; Test ldr clustering.
@@ -114,6 +114,22 @@ define <2 x i64> @ldq_cluster(ptr %p) {
   ret <2 x i64> %res
 }
 
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURSi_LDRSui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3):   %3:fpr32 = LDURSi %0:gpr64
+; CHECK: SU(4):   %4:fpr32 = LDRSui %0:gpr64
+;
+define void @LDURSi_LDRSui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -4
+  %r52 = load float, ptr %r51, align 4
+  %r53 = load float, ptr %arg, align 4
+  store float %r52, ptr %wa
+  store float %r53, ptr %wb
+  ret void
+}
+
 ; Test LDURQi / LDRQui clustering
 ;
 ; CHECK: ********** MI Scheduling **********
@@ -154,3 +170,130 @@ vector_body:
 exit:
   ret void
 }
+
+; Test LDURDi / LDRDui clustering
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURDi_LDRDui:%bb.1 vector_body
+;
+; CHECK: Cluster ld/st SU(2) - SU(6)
+; CHECK: Cluster ld/st SU(3) - SU(7)
+;
+; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
+; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
+;
+define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
+entry:
+  br label %vector_body
+vector_body:
+  %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
+  %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
+  %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
+  %r51 = getelementptr i8, ptr %phi1, i64 -8
+  %r52 = load <2 x float>, ptr %r51, align 8
+  %r53 = getelementptr i8, ptr %phi2, i64 -8
+  %r54 = load <2 x float>, ptr %r53, align 8
+  %r55 = fmul fast <2 x float> %r54, <float 3.0, float 4.0>
+  %r56 = fsub fast <2 x float> %r52, %r55
+  store <2 x float> %r56, ptr %r51, align 1
+  %r57 = load <2 x float>, ptr %phi1, align 8
+  %r58 = load <2 x float>, ptr %phi2, align 8
+  %r59 = fmul fast <2 x float> %r58,  <float 3.0, float 4.0>
+  %r60 = fsub fast <2 x float> %r57, %r59
+  store <2 x float> %r60, ptr %phi1, align 1
+  %r61 = add i32 %phi3, 4
+  %r62 = getelementptr i8, ptr %phi2, i64 32
+  %r63 = getelementptr i8, ptr %phi1, i64 32
+  %r.not = icmp eq i32 %r61, 0
+  br i1 %r.not, label %exit, label %vector_body
+exit:
+  ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURXi_LDRXui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3):  %{{[0-9]+}}:gpr64 = LDURXi 
+; CHECK: SU(4):  %{{[0-9]+}}:gpr64 = LDRXui
+;
+define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -8
+  %r52 = load i64, ptr %r51, align 8
+  %r53 = load i64, ptr %arg, align 8
+  store i64 %r52, ptr %wa
+  store i64 %r53, ptr %wb
+  ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: STURWi_STRWui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3):   STURWi %{{[0-9]+}}:gpr32
+; CHECK: SU(4):   STRWui %{{[0-9]+}}:gpr32
+;
+define void @STURWi_STRWui(ptr nocapture readonly %arg, i32 %b, i32 %c) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -4
+  store i32 %b, ptr %r51
+  store i32 %c, ptr %arg
+  ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: STURXi_STRXui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3):   STURXi %{{[0-9]+}}:gpr64
+; CHECK: SU(4):   STRXui %{{[0-9]+}}:gpr64
+;
+define void @STURXi_STRXui(ptr nocapture readonly %arg, i64 %b, i64 %c) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -8
+  store i64 %b, ptr %r51
+  store i64 %c, ptr %arg
+  ret void
+}
+
+; CHECK-A57: ********** MI Scheduling **********
+; CHECK-A57: STURSi_STRSui:%bb.0 entry
+; CHECK-A57: Cluster ld/st SU(3) - SU(4)
+; CHECK-A57: SU(3):   STURSi %{{[0-9]+}}:fpr32
+; CHECK-A57: SU(4):   STRSui %{{[0-9]+}}:fpr32
+;
+define void @STURSi_STRSui(ptr nocapture readonly %arg, float %b, float %c) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -4
+  store float %b, ptr %r51
+  store float %c, ptr %arg
+  ret void
+}
+
+; CHECK-A57: ********** MI Scheduling **********
+; CHECK-A57: STURDi_STRDui:%bb.0 entry
+; CHECK-A57: Cluster ld/st SU(3) - SU(4)
+; CHECK-A57: SU(3):   STURDi %{{[0-9]+}}:fpr64
+; CHECK-A57: SU(4):   STRDui %{{[0-9]+}}:fpr64
+;
+define void @STURDi_STRDui(ptr nocapture readonly %arg, <2 x float> %b, <2 x float> %c) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -8
+  store <2 x float> %b, ptr %r51
+  store <2 x float> %c, ptr %arg
+  ret void
+}
+
+; CHECK-A57: ********** MI Scheduling **********
+; CHECK-A57: STURQi_STRQui:%bb.0 entry
+; CHECK-A57: Cluster ld/st SU(3) - SU(4)
+; CHECK-A57: SU(3):   STURQi %{{[0-9]+}}:fpr128
+; CHECK-A57: SU(4):   STRQui %{{[0-9]+}}:fpr128
+;
+define void @STURQi_STRQui(ptr nocapture readonly %arg, <2 x double> %b, <2 x double> %c) {
+entry:
+  %r51 = getelementptr i8, ptr %arg, i64 -16
+  store <2 x double> %b, ptr %r51
+  store <2 x double> %c, ptr %arg
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 0a3476e5f4cef6..08ad34c7b03ba0 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1680,31 +1680,31 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
 ; CHECK-NEXT:    add x9, x0, #8
 ; CHECK-NEXT:  LBB17_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp d2, d4, [x9, #-8]
+; CHECK-NEXT:    ldp d2, d3, [x9, #-8]
 ; CHECK-NEXT:    add x10, x1, x8
 ; CHECK-NEXT:    ldp q6, q5, [x10, #32]
 ; CHECK-NEXT:    add x8, x8, #128
 ; CHECK-NEXT:    ldp q17, q16, [x10]
 ; CHECK-NEXT:    cmp x8, #1024
-; CHECK-NEXT:    tbl.16b v3, { v2 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v2 }, v1
 ; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
+; CHECK-NEXT:    tbl.16b v7, { v3 }, v1
+; CHECK-NEXT:    tbl.16b v3, { v3 }, v0
 ; CHECK-NEXT:    add x9, x9, #16
-; CHECK-NEXT:    uaddw2.2d v5, v5, v3
-; CHECK-NEXT:    uaddw.2d v3, v6, v3
+; CHECK-NEXT:    uaddw2.2d v5, v5, v4
+; CHECK-NEXT:    uaddw.2d v4, v6, v4
 ; CHECK-NEXT:    uaddw2.2d v6, v16, v2
 ; CHECK-NEXT:    ldp q18, q16, [x10, #96]
 ; CHECK-NEXT:    uaddw.2d v2, v17, v2
-; CHECK-NEXT:    stp q3, q5, [x10, #32]
+; CHECK-NEXT:    stp q4, q5, [x10, #32]
 ; CHECK-NEXT:    ldp q17, q5, [x10, #64]
 ; CHECK-NEXT:    uaddw2.2d v16, v16, v7
 ; CHECK-NEXT:    uaddw.2d v7, v18, v7
 ; CHECK-NEXT:    stp q2, q6, [x10]
-; CHECK-NEXT:    uaddw2.2d v3, v5, v4
-; CHECK-NEXT:    uaddw.2d v4, v17, v4
+; CHECK-NEXT:    uaddw2.2d v4, v5, v3
+; CHECK-NEXT:    uaddw.2d v3, v17, v3
 ; CHECK-NEXT:    stp q7, q16, [x10, #96]
-; CHECK-NEXT:    stp q4, q3, [x10, #64]
+; CHECK-NEXT:    stp q3, q4, [x10, #64]
 ; CHECK-NEXT:    b.ne LBB17_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret



More information about the llvm-commits mailing list