[llvm] 120e968 - [AArch64] Don't create ST2 for 64bit store that requires an EXT
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 6 06:05:32 PST 2023
Author: David Green
Date: 2023-02-06T14:05:26Z
New Revision: 120e96850486a5d273b10dbdb4b8877eced942fd
URL: https://github.com/llvm/llvm-project/commit/120e96850486a5d273b10dbdb4b8877eced942fd
DIFF: https://github.com/llvm/llvm-project/commit/120e96850486a5d273b10dbdb4b8877eced942fd.diff
LOG: [AArch64] Don't create ST2 for 64bit store that requires an EXT
A 64bit st2 which does not start at element 0 will involve adding extra ext
elements, making the st2 unprofitable. This prevents that case, which can lead
to a few fewer instructions.
Differential Revision: https://reviews.llvm.org/D142966
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/vldn_shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 06fbc9e8e3be1..95f7300064bc1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14626,6 +14626,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
return false;
}
+ // A 64bit st2 which does not start at element 0 will involve adding extra
+ // ext elements, making the st2 unprofitable.
+ if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
+ return false;
Type *PtrTy =
UseScalable
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index a54c5e9c6a17e..e2357ee2d226d 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -301,21 +301,18 @@ define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: ldp q2, q3, [x0, #32]
; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ldp q5, q6, [x0, #80]
+; CHECK-NEXT: ldp q4, q5, [x0, #64]
; CHECK-NEXT: trn1 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT: ldr q4, [x8, #64]!
-; CHECK-NEXT: ldr q1, [x0, #112]
+; CHECK-NEXT: ldp q6, q1, [x0, #96]
; CHECK-NEXT: trn1 v3.8h, v4.8h, v5.8h
-; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
; CHECK-NEXT: trn1 v3.4s, v0.4s, v3.4s
+; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
; CHECK-NEXT: trn1 v4.4s, v2.4s, v1.4s
-; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: ext v1.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT: zip2 v0.4s, v3.4s, v4.4s
; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x0]
-; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x8]
+; CHECK-NEXT: str q0, [x0, #64]
; CHECK-NEXT: ret
entry:
%0 = load <8 x i16>, ptr %a, align 16
@@ -355,21 +352,18 @@ define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q0, q2, [x0]
-; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: ldp q3, q4, [x0, #32]
; CHECK-NEXT: mov v0.h[5], v2.h[4]
-; CHECK-NEXT: ldp q6, q7, [x0, #80]
+; CHECK-NEXT: ldp q5, q6, [x0, #64]
; CHECK-NEXT: zip1 v3.8h, v3.8h, v4.8h
-; CHECK-NEXT: ldr q5, [x8, #64]!
-; CHECK-NEXT: ldr q2, [x0, #112]
+; CHECK-NEXT: ldp q7, q2, [x0, #96]
; CHECK-NEXT: zip1 v4.8h, v5.8h, v6.8h
-; CHECK-NEXT: mov v7.h[5], v2.h[4]
; CHECK-NEXT: mov v0.s[1], v4.s[0]
+; CHECK-NEXT: mov v7.h[5], v2.h[4]
; CHECK-NEXT: uzp1 v1.4s, v3.4s, v7.4s
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0]
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x8]
+; CHECK-NEXT: str q2, [x0, #64]
; CHECK-NEXT: ret
entry:
%0 = load <8 x i16>, ptr %a, align 16
@@ -421,33 +415,29 @@ define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %
; CHECK-NEXT: trn1 v7.8h, v3.8h, v4.8h
; CHECK-NEXT: trn2 v3.8h, v3.8h, v4.8h
; CHECK-NEXT: trn1 v4.8h, v0.8h, v6.8h
-; CHECK-NEXT: trn1 v17.8h, v2.8h, v16.8h
; CHECK-NEXT: trn2 v0.8h, v0.8h, v6.8h
+; CHECK-NEXT: trn1 v17.8h, v2.8h, v16.8h
; CHECK-NEXT: trn2 v2.8h, v2.8h, v16.8h
; CHECK-NEXT: trn1 v18.4s, v5.4s, v4.4s
-; CHECK-NEXT: trn1 v19.4s, v7.4s, v17.4s
; CHECK-NEXT: trn1 v20.4s, v1.4s, v0.4s
+; CHECK-NEXT: trn2 v4.4s, v5.4s, v4.4s
; CHECK-NEXT: trn2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: trn1 v19.4s, v7.4s, v17.4s
; CHECK-NEXT: trn1 v21.4s, v3.4s, v2.4s
-; CHECK-NEXT: trn2 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
-; CHECK-NEXT: trn2 v1.4s, v3.4s, v2.4s
-; CHECK-NEXT: ext v2.16b, v18.16b, v18.16b, #8
; CHECK-NEXT: trn2 v5.4s, v7.4s, v17.4s
+; CHECK-NEXT: trn2 v1.4s, v3.4s, v2.4s
+; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
+; CHECK-NEXT: zip2 v2.4s, v18.4s, v19.4s
; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x1]
-; CHECK-NEXT: ext v3.16b, v19.16b, v19.16b, #8
-; CHECK-NEXT: ext v6.16b, v20.16b, v20.16b, #8
-; CHECK-NEXT: ext v7.16b, v21.16b, v21.16b, #8
+; CHECK-NEXT: zip2 v3.4s, v20.4s, v21.4s
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x2]
+; CHECK-NEXT: zip2 v4.4s, v4.4s, v5.4s
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x3]
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x4]
-; CHECK-NEXT: ext v2.16b, v4.16b, v4.16b, #8
-; CHECK-NEXT: ext v3.16b, v5.16b, v5.16b, #8
-; CHECK-NEXT: st2 { v6.2s, v7.2s }, [x5]
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x6]
-; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x7]
+; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: str q2, [x4]
+; CHECK-NEXT: str q3, [x5]
+; CHECK-NEXT: str q4, [x6]
+; CHECK-NEXT: str q0, [x7]
; CHECK-NEXT: ret
%9 = load <8 x i16>, ptr %0, align 16
%10 = load <8 x i16>, ptr %1, align 16
@@ -505,51 +495,39 @@ define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: ldp q1, q2, [x0, #64]
; CHECK-NEXT: mov x10, x0
-; CHECK-NEXT: mov x11, x0
-; CHECK-NEXT: mov x12, x0
-; CHECK-NEXT: mov x13, x0
-; CHECK-NEXT: mov x14, x0
+; CHECK-NEXT: ldp q6, q7, [x0, #96]
+; CHECK-NEXT: trn1 v16.8h, v1.8h, v2.8h
+; CHECK-NEXT: trn2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x8, #16]!
-; CHECK-NEXT: ldr q2, [x9, #32]!
-; CHECK-NEXT: ldr q3, [x10, #48]!
-; CHECK-NEXT: ldr q4, [x11, #64]!
-; CHECK-NEXT: ldr q6, [x12, #80]!
-; CHECK-NEXT: ldr q7, [x13, #96]!
-; CHECK-NEXT: ldr q16, [x14, #112]!
-; CHECK-NEXT: trn1 v5.8h, v0.8h, v1.8h
-; CHECK-NEXT: trn2 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: trn1 v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: trn2 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT: trn1 v3.8h, v4.8h, v6.8h
-; CHECK-NEXT: trn2 v4.8h, v4.8h, v6.8h
-; CHECK-NEXT: trn1 v17.8h, v7.8h, v16.8h
-; CHECK-NEXT: trn2 v6.8h, v7.8h, v16.8h
-; CHECK-NEXT: trn1 v18.4s, v5.4s, v3.4s
-; CHECK-NEXT: trn1 v20.4s, v0.4s, v4.4s
-; CHECK-NEXT: trn1 v19.4s, v1.4s, v17.4s
-; CHECK-NEXT: trn1 v21.4s, v2.4s, v6.4s
-; CHECK-NEXT: trn2 v22.4s, v5.4s, v3.4s
-; CHECK-NEXT: trn2 v23.4s, v1.4s, v17.4s
-; CHECK-NEXT: trn2 v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
-; CHECK-NEXT: trn2 v1.4s, v2.4s, v6.4s
-; CHECK-NEXT: ext v2.16b, v18.16b, v18.16b, #8
-; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x8]
-; CHECK-NEXT: ext v4.16b, v20.16b, v20.16b, #8
-; CHECK-NEXT: ext v3.16b, v19.16b, v19.16b, #8
-; CHECK-NEXT: st2 { v22.2s, v23.2s }, [x9]
-; CHECK-NEXT: ext v5.16b, v21.16b, v21.16b, #8
+; CHECK-NEXT: ldr q3, [x8, #16]!
+; CHECK-NEXT: ldr q4, [x9, #32]!
+; CHECK-NEXT: ldr q5, [x10, #48]!
+; CHECK-NEXT: trn1 v2.8h, v6.8h, v7.8h
+; CHECK-NEXT: trn2 v6.8h, v6.8h, v7.8h
+; CHECK-NEXT: trn1 v7.8h, v0.8h, v3.8h
+; CHECK-NEXT: trn2 v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: trn1 v17.8h, v4.8h, v5.8h
+; CHECK-NEXT: trn2 v3.8h, v4.8h, v5.8h
+; CHECK-NEXT: trn1 v4.4s, v7.4s, v16.4s
+; CHECK-NEXT: trn1 v18.4s, v0.4s, v1.4s
+; CHECK-NEXT: trn2 v20.4s, v7.4s, v16.4s
+; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: trn1 v5.4s, v17.4s, v2.4s
+; CHECK-NEXT: trn1 v19.4s, v3.4s, v6.4s
+; CHECK-NEXT: trn2 v21.4s, v17.4s, v2.4s
+; CHECK-NEXT: trn2 v1.4s, v3.4s, v6.4s
+; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x0]
+; CHECK-NEXT: zip2 v2.4s, v4.4s, v5.4s
+; CHECK-NEXT: zip2 v3.4s, v18.4s, v19.4s
+; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x8]
+; CHECK-NEXT: zip2 v4.4s, v20.4s, v21.4s
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x10]
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x11]
-; CHECK-NEXT: ext v2.16b, v22.16b, v22.16b, #8
-; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x12]
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v23.16b, v23.16b, #8
-; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x13]
-; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x14]
+; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x9]
+; CHECK-NEXT: stp q2, q3, [x0, #64]
+; CHECK-NEXT: stp q4, q0, [x0, #96]
; CHECK-NEXT: ret
%2 = load <8 x i16>, ptr %0, align 16
%3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
@@ -629,11 +607,10 @@ define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a
; CHECK-NEXT: trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT: trn1 v0.4s, v1.4s, v0.4s
; CHECK-NEXT: zip1 v1.4s, v2.4s, v0.4s
-; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: trn1 v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: zip2 v0.4s, v2.4s, v0.4s
; CHECK-NEXT: str q1, [x0]
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x1]
+; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -648,11 +625,10 @@ define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %
; CHECK-LABEL: store_factor2_high2:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: trn1 v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x1]
+; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: trn1 v2.4s, v2.4s, v1.4s
+; CHECK-NEXT: str q2, [x0]
+; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
%interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
More information about the llvm-commits
mailing list