[llvm] a95a818 - [AArch64] Lower fixed-length vector_shuffle to SVE splat if possible
Cullen Rhodes via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 16 03:48:08 PST 2022
Author: Benjamin Maxwell
Date: 2022-11-16T11:47:27Z
New Revision: a95a8188e897df282b3cf1e79bd46ec095591a44
URL: https://github.com/llvm/llvm-project/commit/a95a8188e897df282b3cf1e79bd46ec095591a44
DIFF: https://github.com/llvm/llvm-project/commit/a95a8188e897df282b3cf1e79bd46ec095591a44.diff
LOG: [AArch64] Lower fixed-length vector_shuffle to SVE splat if possible
This adds an extra case to check if a vector_shuffle for a fixed-length
vector that's being lowered to SVE is just a splat. Doing this avoids
a round trip to the stack and back in a few cases.
Reviewed By: c-rhodes
Differential Revision: https://reviews.llvm.org/D137966
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b06f6308281c..41b30320a90c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23062,16 +23062,28 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
+ auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
+ if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
+ return MVT::i32;
+ return ScalarTy;
+ };
+
+ if (SVN->isSplat()) {
+ unsigned Lane = std::max(0, SVN->getSplatIndex());
+ EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
+ SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
+ DAG.getConstant(Lane, DL, MVT::i64));
+ Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
Imm == VT.getVectorNumElements() - 1) {
if (ReverseEXT)
std::swap(Op1, Op2);
-
- EVT ScalarTy = VT.getVectorElementType();
- if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
- ScalarTy = MVT::i32;
+ EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
SDValue Scalar = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index a4d37caaf199..b9080ed84bbe 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -10,26 +10,11 @@ target triple = "aarch64-unknown-linux-gnu"
define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) vscale_range(2,2) #0 {
; CHECK-LABEL: hang_when_merging_stores_after_legalisation:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: stp s0, s0, [sp, #24]
-; CHECK-NEXT: stp s0, s0, [sp, #16]
-; CHECK-NEXT: stp s0, s0, [sp, #8]
-; CHECK-NEXT: stp s0, s0, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #16
; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
%interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index 8542736694b2..0204613b9fc8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -712,4 +712,82 @@ define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 {
ret void
}
+define <8 x float> @load_splat_v8f32(ptr %p) vscale_range(2,2) #0 {
+; CHECK-LABEL: load_splat_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
+ %v = load <8 x float>, ptr %p
+ %splat = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %splat
+}
+
+define <4 x double> @load_splat_v4f64(ptr %p) vscale_range(2,2) #0 {
+; CHECK-LABEL: load_splat_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
+ %v = load <4 x double>, ptr %p
+ %splat = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %splat
+}
+
+define <32 x i8> @load_splat_v32i8(ptr %p) vscale_range(2,2) #0 {
+; CHECK-LABEL: load_splat_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: mov z0.b, b0
+; CHECK-NEXT: st1b { z0.b }, p0, [x8]
+; CHECK-NEXT: ret
+ %v = load <32 x i8>, ptr %p
+ %splat = shufflevector <32 x i8> %v, <32 x i8> undef, <32 x i32> zeroinitializer
+ ret <32 x i8> %splat
+}
+
+define <16 x i16> @load_splat_v16i16(ptr %p) vscale_range(2,2) #0 {
+; CHECK-LABEL: load_splat_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: ret
+ %v = load <16 x i16>, ptr %p
+ %splat = shufflevector <16 x i16> %v, <16 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %splat
+}
+
+define <8 x i32> @load_splat_v8i32(ptr %p) vscale_range(2,2) #0 {
+; CHECK-LABEL: load_splat_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
+ %v = load <8 x i32>, ptr %p
+ %splat = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %splat
+}
+
+define <4 x i64> @load_splat_v4i64(ptr %p) vscale_range(2,2) #0 {
+; CHECK-LABEL: load_splat_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
+ %v = load <4 x i64>, ptr %p
+ %splat = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %splat
+}
+
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 8a6d1903c8f6..2999b84360a7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -124,8 +124,6 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: insr z0.s, w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
@@ -229,7 +227,6 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: insr z0.s, s0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index fdcc96974f7b..ffea4b4c5007 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -10,15 +10,9 @@ target triple = "aarch64-unknown-linux-gnu"
define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 {
; CHECK-LABEL: hang_when_merging_stores_after_legalisation:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: stp w8, w8, [sp, #8]
-; CHECK-NEXT: stp w8, w8, [sp]
-; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: stp q0, q0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
%interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
More information about the llvm-commits
mailing list