[llvm] bd576e5 - [AArch64][SVE] Improve extract_subvector for predicates.
Sander de Smalen via llvm-commits
llvm-commits@lists.llvm.org
Tue Sep 7 07:49:58 PDT 2021
Author: Sander de Smalen
Date: 2021-09-07T15:49:29+01:00
New Revision: bd576e5ac0fc097c16f5c2a900d5c243289a2a8b
URL: https://github.com/llvm/llvm-project/commit/bd576e5ac0fc097c16f5c2a900d5c243289a2a8b
DIFF: https://github.com/llvm/llvm-project/commit/bd576e5ac0fc097c16f5c2a900d5c243289a2a8b.diff
LOG: [AArch64][SVE] Improve extract_subvector for predicates.
Use PUNPKLO/PUNPKHI instead of ZIP1/ZIP2 when extracting the lo/hi halves
of a predicate, because that avoids having to generate a predicate with
all lanes inactive (PFALSE).
Reviewed By: CarolineConcatto
Differential Revision: https://reviews.llvm.org/D109312
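The change rests on an equivalence: zipping a predicate with an all-false
predicate interleaves its lane bits with zeros, which is exactly what the
predicate unpack instructions do when widening a half, so PUNPKLO/PUNPKHI
can stand in for the ZIP1/ZIP2-with-PFALSE pairs. A minimal before/after
sketch (register choices follow the updated masked_load_split_8i32 test
below):

  // Before: an extra instruction and an extra predicate register.
  pfalse  p1.b
  zip1    p2.h, p0.h, p1.h   // lo half of p0, interleaved with false
  zip2    p0.h, p0.h, p1.h   // hi half of p0, interleaved with false

  // After: no PFALSE needed.
  punpklo p1.h, p0.b         // widen the lo half of p0
  punpkhi p0.h, p0.b         // widen the hi half of p0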
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
llvm/test/CodeGen/AArch64/sve-split-load.ll
llvm/test/CodeGen/AArch64/sve-split-store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1d8e3f9ab4c8..3a6f0841f200 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1222,17 +1222,17 @@ let Predicates = [HasSVEorStreamingSVE] in {
// Extract lo/hi halves of legal predicate types.
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
- (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
+ (PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
- (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
+ (PUNPKHI_PP PPR:$Ps)>;
def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
- (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
+ (PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
- (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
+ (PUNPKHI_PP PPR:$Ps)>;
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
- (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
+ (PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
- (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+ (PUNPKHI_PP PPR:$Ps)>;
// Extract subvectors from FP SVE vectors
def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))),
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
index 16bac85e8091..501eacd09b4f 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -80,10 +80,9 @@ define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vsc
define <vscale x 4 x half> @masked_gather_nxv4f16(<vscale x 4 x half*> %ptrs, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z1.d }, p1/z, [z1.d]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
@@ -106,16 +105,15 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i
define <vscale x 8 x half> @masked_gather_nxv8f16(<vscale x 8 x half*> %ptrs, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p3.s, p2.s, p1.s
-; CHECK-NEXT: zip1 p2.s, p2.s, p1.s
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
-; CHECK-NEXT: ld1h { z3.d }, p3/z, [z3.d]
-; CHECK-NEXT: ld1h { z2.d }, p2/z, [z2.d]
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z3.d }, p2/z, [z3.d]
+; CHECK-NEXT: ld1h { z2.d }, p1/z, [z2.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z1.d }, p1/z, [z1.d]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
@@ -128,12 +126,11 @@ define <vscale x 8 x half> @masked_gather_nxv8f16(<vscale x 8 x half*> %ptrs, <v
define <vscale x 8 x bfloat> @masked_gather_nxv8bf16(bfloat* %base, <vscale x 8 x i16> %indices, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
; CHECK-NEXT: sunpkhi z1.s, z0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT: punpkhi p1.h, p0.b
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: ld1h { z1.s }, p2/z, [x0, z1.s, sxtw #1]
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1]
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
@@ -148,12 +145,11 @@ define <vscale x 4 x double> @masked_gather_nxv4f64(double* %base, <vscale x 4 x
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxth z1.s, p1/m, z0.s
-; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: sunpklo z0.d, z1.s
-; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: sunpkhi z1.d, z1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, z0.d, lsl #3]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, z1.d, lsl #3]
; CHECK-NEXT: ret
%ptrs = getelementptr double, double* %base, <vscale x 4 x i16> %indices
@@ -164,10 +160,9 @@ define <vscale x 4 x double> @masked_gather_nxv4f64(double* %base, <vscale x 4 x
define <vscale x 8 x float> @masked_gather_nxv8f32(float* %base, <vscale x 8 x i32> %offsets, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, z1.s, uxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 8 x i32> %offsets to <vscale x 8 x i64>
@@ -180,23 +175,22 @@ define <vscale x 8 x float> @masked_gather_nxv8f32(float* %base, <vscale x 8 x i
define <vscale x 16 x i8> @masked_gather_nxv16i8(i8* %base, <vscale x 16 x i8> %indices, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.b, p0.b, p1.b
; CHECK-NEXT: sunpkhi z1.h, z0.b
-; CHECK-NEXT: zip2 p3.h, p2.h, p1.h
+; CHECK-NEXT: punpkhi p1.h, p0.b
; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: zip1 p2.h, p2.h, p1.h
+; CHECK-NEXT: punpkhi p2.h, p1.b
; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: ld1sb { z2.s }, p3/z, [x0, z2.s, sxtw]
-; CHECK-NEXT: ld1sb { z1.s }, p2/z, [x0, z1.s, sxtw]
-; CHECK-NEXT: zip1 p0.b, p0.b, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1sb { z2.s }, p2/z, [x0, z2.s, sxtw]
+; CHECK-NEXT: ld1sb { z1.s }, p1/z, [x0, z1.s, sxtw]
; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: punpkhi p1.h, p0.b
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: sunpkhi z2.s, z0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: ld1sb { z2.s }, p2/z, [x0, z2.s, sxtw]
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z2.s }, p1/z, [x0, z2.s, sxtw]
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
@@ -210,33 +204,26 @@ define <vscale x 16 x i8> @masked_gather_nxv16i8(i8* %base, <vscale x 16 x i8> %
define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i32> %indices, <vscale x 32 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv32i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip1 p4.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p3.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
-; CHECK-NEXT: ld1w { z0.s }, p4/z, [x0, z0.s, sxtw #2]
-; CHECK-NEXT: ld1w { z1.s }, p3/z, [x0, z1.s, sxtw #2]
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: ld1w { z2.s }, p3/z, [x0, z2.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p3.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p3/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ld1w { z1.s }, p2/z, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z2.s }, p2/z, [x0, z2.s, sxtw #2]
; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, z3.s, sxtw #2]
-; CHECK-NEXT: zip1 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: ld1w { z4.s }, p3/z, [x0, z4.s, sxtw #2]
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z4.s }, p2/z, [x0, z4.s, sxtw #2]
; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, z5.s, sxtw #2]
-; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p1.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
+; CHECK-NEXT: punpkhi p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: ld1w { z6.s }, p1/z, [x0, z6.s, sxtw #2]
; CHECK-NEXT: ld1w { z7.s }, p0/z, [x0, z7.s, sxtw #2]
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 32 x i32> %indices
%data = call <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*> %ptrs, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i32> undef)
@@ -250,10 +237,9 @@ define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i3
define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1sb { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z1.d }, p1/z, [z1.d]
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
index e8705268dda1..705142566527 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
@@ -9,28 +9,27 @@ target triple = "aarch64-linux-gnu"
define void @masked_scatter_nxv16i8(<vscale x 16 x i8> %data, i8* %base, <vscale x 16 x i8> %offsets, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: masked_scatter_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.b, p0.b, p1.b
; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: uunpklo z4.h, z0.b
-; CHECK-NEXT: zip1 p3.h, p2.h, p1.h
; CHECK-NEXT: sunpklo z3.s, z2.h
+; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: uunpklo z5.s, z4.h
-; CHECK-NEXT: st1b { z5.s }, p3, [x0, z3.s, sxtw]
-; CHECK-NEXT: zip2 p2.h, p2.h, p1.h
+; CHECK-NEXT: st1b { z5.s }, p2, [x0, z3.s, sxtw]
; CHECK-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEXT: punpkhi p1.h, p1.b
; CHECK-NEXT: uunpkhi z3.s, z4.h
-; CHECK-NEXT: zip2 p0.b, p0.b, p1.b
; CHECK-NEXT: sunpkhi z1.h, z1.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: uunpkhi z0.h, z0.b
-; CHECK-NEXT: st1b { z3.s }, p2, [x0, z2.s, sxtw]
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw]
; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: uunpkhi z0.s, z0.h
-; CHECK-NEXT: st1b { z3.s }, p2, [x0, z2.s, sxtw]
+; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw]
; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 16 x i8> %offsets
@@ -41,14 +40,13 @@ define void @masked_scatter_nxv16i8(<vscale x 16 x i8> %data, i8* %base, <vscale
define void @masked_scatter_nxv8i16(<vscale x 8 x i16> %data, i16* %base, <vscale x 8 x i16> %offsets, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_scatter_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: uunpkhi z0.s, z0.h
-; CHECK-NEXT: st1h { z3.s }, p2, [x0, z2.s, sxtw #1]
+; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 8 x i16> %offsets
@@ -59,14 +57,13 @@ define void @masked_scatter_nxv8i16(<vscale x 8 x i16> %data, i16* %base, <vscal
define void @masked_scatter_nxv8bf16(<vscale x 8 x bfloat> %data, bfloat* %base, <vscale x 8 x i16> %offsets, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_scatter_nxv8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: uunpkhi z0.s, z0.h
-; CHECK-NEXT: st1h { z3.s }, p2, [x0, z2.s, sxtw #1]
+; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 8 x i16> %offsets
@@ -77,10 +74,9 @@ define void @masked_scatter_nxv8bf16(<vscale x 8 x bfloat> %data, bfloat* %base,
define void @masked_scatter_nxv8f32(<vscale x 8 x float> %data, float* %base, <vscale x 8 x i32> %indexes, <vscale x 8 x i1> %masks) #0 {
; CHECK-LABEL: masked_scatter_nxv8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: st1w { z0.s }, p2, [x0, z2.s, uxtw #2]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z0.s }, p1, [x0, z2.s, uxtw #2]
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z3.s, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 8 x i32> %indexes to <vscale x 8 x i64>
@@ -93,9 +89,6 @@ define void @masked_scatter_nxv8f32(<vscale x 8 x float> %data, float* %base, <v
define void @masked_scatter_nxv32i32(<vscale x 32 x i32> %data, i32* %base, <vscale x 32 x i32> %offsets, <vscale x 32 x i1> %mask) #0 {
; CHECK-LABEL: masked_scatter_nxv32i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue p2.s
; CHECK-NEXT: ld1w { z24.s }, p2/z, [x1, #7, mul vl]
; CHECK-NEXT: ld1w { z25.s }, p2/z, [x1, #6, mul vl]
@@ -105,30 +98,26 @@ define void @masked_scatter_nxv32i32(<vscale x 32 x i32> %data, i32* %base, <vsc
; CHECK-NEXT: ld1w { z29.s }, p2/z, [x1, #2, mul vl]
; CHECK-NEXT: ld1w { z30.s }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1w { z31.s }, p2/z, [x1]
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip1 p4.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p3.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
-; CHECK-NEXT: st1w { z0.s }, p4, [x0, z31.s, sxtw #2]
-; CHECK-NEXT: st1w { z1.s }, p3, [x0, z30.s, sxtw #2]
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: st1w { z2.s }, p3, [x0, z29.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p3.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z0.s }, p3, [x0, z31.s, sxtw #2]
+; CHECK-NEXT: st1w { z1.s }, p2, [x0, z30.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z2.s }, p2, [x0, z29.s, sxtw #2]
; CHECK-NEXT: st1w { z3.s }, p0, [x0, z28.s, sxtw #2]
-; CHECK-NEXT: zip1 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: st1w { z4.s }, p3, [x0, z27.s, sxtw #2]
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z4.s }, p2, [x0, z27.s, sxtw #2]
; CHECK-NEXT: st1w { z5.s }, p0, [x0, z26.s, sxtw #2]
-; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p1.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
+; CHECK-NEXT: punpkhi p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: st1w { z6.s }, p1, [x0, z25.s, sxtw #2]
; CHECK-NEXT: st1w { z7.s }, p0, [x0, z24.s, sxtw #2]
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 32 x i32> %offsets
call void @llvm.masked.scatter.nxv32i32(<vscale x 32 x i32> %data, <vscale x 32 x i32*> %ptrs, i32 4, <vscale x 32 x i1> %mask)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
index 4737805d6ab7..0d2e04c80d46 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
@@ -76,11 +76,10 @@ define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x d
define void @masked_scatter_splat_constant_pointer (<vscale x 4 x i1> %pg) {
; CHECK-LABEL: masked_scatter_splat_constant_pointer:
; CHECK: // %bb.0: // %vector.body
-; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
-; CHECK-NEXT: st1w { z0.d }, p2, [x8, z0.d, lsl #2]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p1, [x8, z0.d, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z0.d, lsl #2]
; CHECK-NEXT: ret
vector.body:
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
index e28c91c22917..b004bd7b0f6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -305,11 +305,10 @@ define <vscale x 4 x double> @scvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 4 x double> @scvtf_d_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: scvtf_d_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p3.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: ptrue p2.d
-; CHECK-NEXT: mov z0.d, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: scvtf z0.d, p2/m, z0.d
; CHECK-NEXT: scvtf z1.d, p2/m, z1.d
@@ -366,11 +365,10 @@ define <vscale x 4 x double> @ucvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 4 x double> @ucvtf_d_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: ucvtf_d_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p3.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: ptrue p2.d
-; CHECK-NEXT: mov z0.d, p3/z, #1 // =0x1
+; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1
; CHECK-NEXT: ucvtf z0.d, p2/m, z0.d
; CHECK-NEXT: ucvtf z1.d, p2/m, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
index a291412cec5f..7568c0ea6eac 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -90,15 +90,14 @@ define <vscale x 32 x i8> @masked_load_split_32i8(<vscale x 32 x i8> *%a, <vscal
define <vscale x 32 x i16> @masked_load_split_32i16(<vscale x 32 x i16> *%a, <vscale x 32 x i1> %pg) {
; CHECK-LABEL: masked_load_split_32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
-; CHECK-NEXT: ld1h { z0.h }, p3/z, [x0]
-; CHECK-NEXT: zip1 p3.b, p1.b, p2.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: punpklo p3.h, p1.b
+; CHECK-NEXT: punpkhi p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
; CHECK-NEXT: ld1h { z2.h }, p3/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0, #3, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16> *%a, i32 1, <vscale x 32 x i1> %pg, <vscale x 32 x i16> undef)
ret <vscale x 32 x i16> %load
@@ -107,10 +106,9 @@ define <vscale x 32 x i16> @masked_load_split_32i16(<vscale x 32 x i16> *%a, <vs
define <vscale x 8 x i32> @masked_load_split_8i32(<vscale x 8 x i32> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_load_split_8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i32> undef)
@@ -120,16 +118,15 @@ define <vscale x 8 x i32> @masked_load_split_8i32(<vscale x 8 x i32> *%a, <vscal
define <vscale x 8 x i64> @masked_load_split_8i64(<vscale x 8 x i64> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_load_split_8i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: zip1 p3.s, p2.s, p1.s
-; CHECK-NEXT: zip2 p2.s, p2.s, p1.s
-; CHECK-NEXT: ld1d { z0.d }, p3/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p2/z, [x0, #1, mul vl]
-; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: punpklo p2.h, p1.b
+; CHECK-NEXT: punpkhi p1.h, p1.b
+; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i64> undef)
diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll
index 27555e24c695..b19b18edd917 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll
@@ -78,12 +78,11 @@ define void @masked_store_split_32i8(<vscale x 32 x i8> %data, <vscale x 32 x i8
define void @masked_store_split_32i16(<vscale x 32 x i16> %data, <vscale x 32 x i16> *%a, <vscale x 32 x i1> %pg) {
; CHECK-LABEL: masked_store_split_32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip2 p3.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p1.b, p1.b, p2.b
-; CHECK-NEXT: st1h { z3.h }, p3, [x0, #3, mul vl]
-; CHECK-NEXT: zip2 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p2.b
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpkhi p3.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z3.h }, p2, [x0, #3, mul vl]
; CHECK-NEXT: st1h { z2.h }, p1, [x0, #2, mul vl]
; CHECK-NEXT: st1h { z1.h }, p3, [x0, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
@@ -95,10 +94,9 @@ define void @masked_store_split_32i16(<vscale x 32 x i16> %data, <vscale x 32 x
define void @masked_store_split_8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_store_split_8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
-; CHECK-NEXT: st1w { z1.s }, p2, [x0, #1, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32> *%a, i32 1, <vscale x 8 x i1> %pg)
@@ -108,16 +106,15 @@ define void @masked_store_split_8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32
define void @masked_store_split_8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_store_split_8i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p3.s, p2.s, p1.s
-; CHECK-NEXT: zip1 p2.s, p2.s, p1.s
-; CHECK-NEXT: st1d { z3.d }, p3, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p2, [x0, #2, mul vl]
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: st1d { z1.d }, p2, [x0, #1, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: st1d { z3.d }, p2, [x0, #3, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p1, [x0, #2, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1d { z1.d }, p1, [x0, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg)