[llvm] [AArch64] MI Scheduler LDP/STP combine follow up (PR #79003)
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 25 02:52:58 PST 2024
https://github.com/sjoerdmeijer updated https://github.com/llvm/llvm-project/pull/79003
>From 4bd9a164b1431251d68d26805c5907f2adec85d3 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer at nvidia.com>
Date: Tue, 16 Jan 2024 17:20:41 +0530
Subject: [PATCH] [AArch64] MI Scheduler LDP combine follow up
This is a follow-up to 75d820dcdd86, adding more opcodes to the combine
target hook to enable more LDP creation.
Patch co-authored by Cameron McInally.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 9 +++
.../test/CodeGen/AArch64/arm64-ldp-cluster.ll | 75 ++++++++++++++++++-
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 20 ++---
3 files changed, 93 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 42b7a6418032adc..5e7763580011129 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4214,6 +4214,12 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
switch (FirstOpc) {
default:
return false;
+ case AArch64::LDRSui:
+ case AArch64::LDURSi:
+ return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
+ case AArch64::LDRDui:
+ case AArch64::LDURDi:
+ return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
case AArch64::LDRQui:
case AArch64::LDURQi:
return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
@@ -4223,6 +4229,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
}
// These instructions can't be paired based on their opcodes.
return false;
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 83f86d1c3a7cbf3..f9ab7175a1a12b8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CHECK-A57
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
; Test ldr clustering.
@@ -114,6 +114,22 @@ define <2 x i64> @ldq_cluster(ptr %p) {
ret <2 x i64> %res
}
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURSi_LDRSui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): %3:fpr32 = LDURSi %0:gpr64
+; CHECK: SU(4): %4:fpr32 = LDRSui %0:gpr64
+;
+define void @LDURSi_LDRSui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -4
+ %r52 = load float, ptr %r51, align 4
+ %r53 = load float, ptr %arg, align 4
+ store float %r52, ptr %wa
+ store float %r53, ptr %wb
+ ret void
+}
+
; Test LDURQi / LDRQui clustering
;
; CHECK: ********** MI Scheduling **********
@@ -154,3 +170,60 @@ vector_body:
exit:
ret void
}
+
+; Test LDURDi / LDRDui clustering
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURDi_LDRDui:%bb.1 vector_body
+;
+; CHECK: Cluster ld/st SU(2) - SU(6)
+; CHECK: Cluster ld/st SU(3) - SU(7)
+;
+; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
+; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
+;
+define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
+entry:
+ br label %vector_body
+vector_body:
+ %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
+ %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
+ %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
+ %r51 = getelementptr i8, ptr %phi1, i64 -8
+ %r52 = load <2 x float>, ptr %r51, align 8
+ %r53 = getelementptr i8, ptr %phi2, i64 -8
+ %r54 = load <2 x float>, ptr %r53, align 8
+ %r55 = fmul fast <2 x float> %r54, <float 3.0, float 4.0>
+ %r56 = fsub fast <2 x float> %r52, %r55
+ store <2 x float> %r56, ptr %r51, align 1
+ %r57 = load <2 x float>, ptr %phi1, align 8
+ %r58 = load <2 x float>, ptr %phi2, align 8
+ %r59 = fmul fast <2 x float> %r58, <float 3.0, float 4.0>
+ %r60 = fsub fast <2 x float> %r57, %r59
+ store <2 x float> %r60, ptr %phi1, align 1
+ %r61 = add i32 %phi3, 4
+ %r62 = getelementptr i8, ptr %phi2, i64 32
+ %r63 = getelementptr i8, ptr %phi1, i64 32
+ %r.not = icmp eq i32 %r61, 0
+ br i1 %r.not, label %exit, label %vector_body
+exit:
+ ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURXi_LDRXui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
+; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
+;
+define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -8
+ %r52 = load i64, ptr %r51, align 8
+ %r53 = load i64, ptr %arg, align 8
+ store i64 %r52, ptr %wa
+ store i64 %r53, ptr %wb
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 0a3476e5f4cef63..08ad34c7b03ba07 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1680,31 +1680,31 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: LBB17_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp d2, d4, [x9, #-8]
+; CHECK-NEXT: ldp d2, d3, [x9, #-8]
; CHECK-NEXT: add x10, x1, x8
; CHECK-NEXT: ldp q6, q5, [x10, #32]
; CHECK-NEXT: add x8, x8, #128
; CHECK-NEXT: ldp q17, q16, [x10]
; CHECK-NEXT: cmp x8, #1024
-; CHECK-NEXT: tbl.16b v3, { v2 }, v1
+; CHECK-NEXT: tbl.16b v4, { v2 }, v1
; CHECK-NEXT: tbl.16b v2, { v2 }, v0
-; CHECK-NEXT: tbl.16b v7, { v4 }, v1
-; CHECK-NEXT: tbl.16b v4, { v4 }, v0
+; CHECK-NEXT: tbl.16b v7, { v3 }, v1
+; CHECK-NEXT: tbl.16b v3, { v3 }, v0
; CHECK-NEXT: add x9, x9, #16
-; CHECK-NEXT: uaddw2.2d v5, v5, v3
-; CHECK-NEXT: uaddw.2d v3, v6, v3
+; CHECK-NEXT: uaddw2.2d v5, v5, v4
+; CHECK-NEXT: uaddw.2d v4, v6, v4
; CHECK-NEXT: uaddw2.2d v6, v16, v2
; CHECK-NEXT: ldp q18, q16, [x10, #96]
; CHECK-NEXT: uaddw.2d v2, v17, v2
-; CHECK-NEXT: stp q3, q5, [x10, #32]
+; CHECK-NEXT: stp q4, q5, [x10, #32]
; CHECK-NEXT: ldp q17, q5, [x10, #64]
; CHECK-NEXT: uaddw2.2d v16, v16, v7
; CHECK-NEXT: uaddw.2d v7, v18, v7
; CHECK-NEXT: stp q2, q6, [x10]
-; CHECK-NEXT: uaddw2.2d v3, v5, v4
-; CHECK-NEXT: uaddw.2d v4, v17, v4
+; CHECK-NEXT: uaddw2.2d v4, v5, v3
+; CHECK-NEXT: uaddw.2d v3, v17, v3
; CHECK-NEXT: stp q7, q16, [x10, #96]
-; CHECK-NEXT: stp q4, q3, [x10, #64]
+; CHECK-NEXT: stp q3, q4, [x10, #64]
; CHECK-NEXT: b.ne LBB17_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list