[llvm] [SystemZ] Consider VST/VL as SimpleBDXStore/Load (PR #135623)
Dominik Steenken via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 14 06:16:38 PDT 2025
https://github.com/dominik-steenken created https://github.com/llvm/llvm-project/pull/135623
Previously `vst` and `vl` were not considered "simple" BDX stores and loads, leading to, among other things, some opportunities for `mvc` optimization to be missed.
This PR addresses this and updates some tests to account for additional `mvc` instructions being emitted.
This is observed to have a neutral or slightly beneficial effect performance-wise.
From cba24c1d3ab9ccbcc9ec11c2ebc65c92dccb69b7 Mon Sep 17 00:00:00 2001
From: Dominik Steenken <dost at de.ibm.com>
Date: Mon, 17 Mar 2025 13:49:30 +0100
Subject: [PATCH] [SystemZ] Consider VST/VL as SimpleBDXStore/Load
Previously VST and VL were not considered "simple" BDX stores and loads,
leading to, among other things, some opportunities for mvc optimization
to be missed.
This commit addresses this and updates some tests to account for additional
mvc instructions being emitted.
This is observed to have a neutral or slightly beneficial effect
performance-wise.
---
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 6 +-
.../vector-constrained-fp-intrinsics.ll | 68 ++++++++-----------
2 files changed, 31 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index e1fe7edc4cb08..d8c48239ac633 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -105,7 +105,8 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVector] in {
// Load.
- defm VL : UnaryVRXAlign<"vl", 0xE706>;
+ let SimpleBDXLoad = 1 in
+ defm VL : UnaryVRXAlign<"vl", 0xE706>;
// Load to block boundary. The number of loaded bytes is only known
// at run time. The instruction is really polymorphic, but v128b matches
@@ -213,7 +214,8 @@ defm : ReplicatePeephole<VLREPG, v2f64, z_load, f64>;
let Predicates = [FeatureVector] in {
// Store.
- defm VST : StoreVRXAlign<"vst", 0xE70E>;
+ let SimpleBDXStore = 1 in
+ defm VST : StoreVRXAlign<"vst", 0xE70E>;
// Store with length. The number of stored bytes is only known at run time.
def VSTL : StoreLengthVRSb<"vstl", 0xE73F, int_s390_vstl, 0>;
diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
index e0818ea3da294..96d91953b9f83 100644
--- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -440,10 +440,9 @@ define void @constrained_vector_frem_v3f64(ptr %a) #0 {
; SZ13-NEXT: ld %f0, 0(%r1)
; SZ13-NEXT: ldr %f2, %f8
; SZ13-NEXT: brasl %r14, fmod@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -1473,11 +1472,10 @@ define void @constrained_vector_pow_v3f64(ptr %a) #0 {
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: ldr %f2, %f9
; SZ13-NEXT: brasl %r14, pow@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 200(%r15) # 8-byte Reload
; SZ13-NEXT: ld %f9, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 312(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -1829,9 +1827,8 @@ define void @constrained_vector_powi_v3f64(ptr %a) #0 {
; SZ13-NEXT: ld %f0, 0(%r1)
; SZ13-NEXT: lghi %r2, 3
; SZ13-NEXT: brasl %r14, __powidf2@PLT
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
; SZ13-NEXT: lmg %r13, %r15, 280(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -2155,10 +2152,9 @@ define void @constrained_vector_sin_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, sin@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -2472,10 +2468,9 @@ define void @constrained_vector_cos_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, cos@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -2789,10 +2784,9 @@ define void @constrained_vector_exp_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, exp@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -3106,10 +3100,9 @@ define void @constrained_vector_exp2_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, exp2@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -3423,10 +3416,9 @@ define void @constrained_vector_log_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, log@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -3740,10 +3732,9 @@ define void @constrained_vector_log10_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, log10@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -4057,10 +4048,9 @@ define void @constrained_vector_log2_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, log2@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -4788,10 +4778,9 @@ define void @constrained_vector_log10_maxnum_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, fmax@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -5165,11 +5154,10 @@ define void @constrained_vector_minnum_v3f64(ptr %a) #0 {
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: ldr %f2, %f9
; SZ13-NEXT: brasl %r14, fmin@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 200(%r15) # 8-byte Reload
; SZ13-NEXT: ld %f9, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 312(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -6612,10 +6600,9 @@ define void @constrained_vector_tan_v3f64(ptr %a) #0 {
; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: brasl %r14, tan@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 304(%r15)
; SZ13-NEXT: br %r14
entry:
@@ -6977,11 +6964,10 @@ define void @constrained_vector_atan2_v3f64(ptr %a, ptr %b) #0 {
; SZ13-NEXT: ldr %f0, %f8
; SZ13-NEXT: ldr %f2, %f9
; SZ13-NEXT: brasl %r14, atan2@PLT
-; SZ13-NEXT: std %f0, 16(%r13)
-; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT: mvc 0(16,%r13), 160(%r15) # 16-byte Folded Reload
; SZ13-NEXT: ld %f8, 216(%r15) # 8-byte Reload
; SZ13-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; SZ13-NEXT: vst %v0, 0(%r13), 4
+; SZ13-NEXT: std %f0, 16(%r13)
; SZ13-NEXT: lmg %r13, %r15, 328(%r15)
; SZ13-NEXT: br %r14
entry:
More information about the llvm-commits mailing list