[llvm] c95253b - [LLVM][SVE] Clean VLS tests to not use wide vectors as function return types.
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 05:43:12 PDT 2023
Author: Paul Walker
Date: 2023-11-02T12:41:37Z
New Revision: c95253b1bac865b6d90cce186b7d665de163d50c
URL: https://github.com/llvm/llvm-project/commit/c95253b1bac865b6d90cce186b7d665de163d50c
DIFF: https://github.com/llvm/llvm-project/commit/c95253b1bac865b6d90cce186b7d665de163d50c.diff
LOG: [LLVM][SVE] Clean VLS tests to not use wide vectors as function return types.
Added:
llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
Modified:
llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
Removed:
llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
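As the log message says, the tests no longer return wide fixed-length (VLS) vectors; instead each function takes an extra pointer argument and stores the result. On AArch64 a return value too large for registers is returned indirectly through x8, which is why the old CHECK lines stored via [x8]; after the cleanup the stores go to an explicitly passed pointer such as [x1] or [x2]. A minimal sketch of the pattern applied throughout the diff (the function name and element type here are placeholders, not taken from any particular test):

  ; before: wide vector returned indirectly via x8
  define <8 x i32> @example(ptr %a) #0 {
    %v = load <8 x i32>, ptr %a
    ret <8 x i32> %v
  }

  ; after: result stored through an explicit pointer argument
  define void @example(ptr %a, ptr %b) #0 {
    %v = load <8 x i32>, ptr %a
    store <8 x i32> %v, ptr %b
    ret void
  }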
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 79962d441d1048d..0dd7320413a147f 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -446,7 +446,7 @@ define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
ret <2 x i64> %retval
}
-define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+define void @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, ptr %p) nounwind #0 {
; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
@@ -454,12 +454,13 @@ define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
- ret <4 x i64> %retval
+ store <4 x i64> %retval, ptr %p
+ ret void
}
; Check that extract from load via bitcast-gep-of-scalar-ptr does not crash.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
index 3fdd08701053ebb..fb494afa11de269 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
@@ -9,7 +9,7 @@ target triple = "aarch64"
; this is implemented, this test will be fleshed out.
;
-define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr) #0 {
+define void @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr, ptr %result_ptr) #0 {
; CHECK-LABEL: fixed_bitselect_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -22,7 +22,7 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
; CHECK-NEXT: and z0.d, z0.d, z2.d
; CHECK-NEXT: and z1.d, z1.d, z3.d
; CHECK-NEXT: orr z0.d, z1.d, z0.d
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x3]
; CHECK-NEXT: ret
%pre_cond = load <8 x i32>, ptr %pre_cond_ptr
%left = load <8 x i32>, ptr %left_ptr
@@ -33,7 +33,8 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
%left_bits_0 = and <8 x i32> %neg_cond, %left
%right_bits_0 = and <8 x i32> %min_cond, %right
%bsl0000 = or <8 x i32> %right_bits_0, %left_bits_0
- ret <8 x i32> %bsl0000
+ store <8 x i32> %bsl0000, ptr %result_ptr
+ ret void
}
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index 94aef218d4de319..f6ed2e6a787f01d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -34,62 +34,66 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 {
ret <2 x i256> %val
}
-define <8 x i32> @load_zext_v8i16i32(ptr %ap) vscale_range(2,0) #0 {
+define void @load_zext_v8i16i32(ptr %ap, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: load_zext_v8i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %ap
%val = zext <8 x i16> %a to <8 x i32>
- ret <8 x i32> %val
+ store <8 x i32> %val, ptr %b
+ ret void
}
-define <16 x i32> @load_zext_v16i16i32(ptr %ap) vscale_range(4,0) #0 {
+define void @load_zext_v16i16i32(ptr %ap, ptr %b) vscale_range(4,0) #0 {
; CHECK-LABEL: load_zext_v16i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i16>, ptr %ap
%val = zext <16 x i16> %a to <16 x i32>
- ret <16 x i32> %val
+ store <16 x i32> %val, ptr %b
+ ret void
}
-define <32 x i32> @load_zext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
+define void @load_zext_v32i16i32(ptr %ap, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: load_zext_v32i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i16>, ptr %ap
%val = zext <32 x i16> %a to <32 x i32>
- ret <32 x i32> %val
+ store <32 x i32> %val, ptr %b
+ ret void
}
-define <64 x i32> @load_zext_v64i16i32(ptr %ap) #0 {
+define void @load_zext_v64i16i32(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20
+; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_1024-NEXT: ld1h { z1.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <64 x i16>, ptr %ap
%val = zext <64 x i16> %a to <64 x i32>
- ret <64 x i32> %val
+ store <64 x i32> %val, ptr %b
+ ret void
}
define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 {
@@ -103,196 +107,206 @@ define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 {
ret <4 x i32> %val
}
-define <8 x i32> @load_sext_v8i16i32(ptr %ap) vscale_range(2,0) #0 {
+define void @load_sext_v8i16i32(ptr %ap, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: load_sext_v8i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %ap
%val = sext <8 x i16> %a to <8 x i32>
- ret <8 x i32> %val
+ store <8 x i32> %val, ptr %b
+ ret void
}
-define <16 x i32> @load_sext_v16i16i32(ptr %ap) vscale_range(4,0) #0 {
+define void @load_sext_v16i16i32(ptr %ap, ptr %b) vscale_range(4,0) #0 {
; CHECK-LABEL: load_sext_v16i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i16>, ptr %ap
%val = sext <16 x i16> %a to <16 x i32>
- ret <16 x i32> %val
+ store <16 x i32> %val, ptr %b
+ ret void
}
-define <32 x i32> @load_sext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
+define void @load_sext_v32i16i32(ptr %ap, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: load_sext_v32i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i16>, ptr %ap
%val = sext <32 x i16> %a to <32 x i32>
- ret <32 x i32> %val
+ store <32 x i32> %val, ptr %b
+ ret void
}
-define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
+define void @load_sext_v64i16i32(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20
-; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20
+; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_1024-NEXT: ld1sh { z1.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <64 x i16>, ptr %ap
%val = sext <64 x i16> %a to <64 x i32>
- ret <64 x i32> %val
+ store <64 x i32> %val, ptr %b
+ ret void
}
-define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
+define void @load_zext_v32i8i64(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10
-; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [x0, x9]
+; VBITS_GE_1024-NEXT: mov w8, #16 // =0x10
+; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
; VBITS_GE_1024-NEXT: ld1b { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i8>, ptr %ap
%val = zext <32 x i8> %a to <32 x i64>
- ret <32 x i64> %val
+ store <32 x i64> %val, ptr %b
+ ret void
}
-define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
+define void @load_sext_v32i8i64(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10
-; VBITS_GE_1024-NEXT: ld1sb { z0.d }, p0/z, [x0, x9]
+; VBITS_GE_1024-NEXT: mov w8, #16 // =0x10
+; VBITS_GE_1024-NEXT: ld1sb { z0.d }, p0/z, [x0, x8]
; VBITS_GE_1024-NEXT: ld1sb { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i8>, ptr %ap
%val = sext <32 x i8> %a to <32 x i64>
- ret <32 x i64> %val
+ store <32 x i64> %val, ptr %b
+ ret void
}
-define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
+define void @load_zext_v32i16i64(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_1024-NEXT: ld1h { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i16>, ptr %ap
%val = zext <32 x i16> %a to <32 x i64>
- ret <32 x i64> %val
+ store <32 x i64> %val, ptr %b
+ ret void
}
-define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
+define void @load_sext_v32i16i64(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_1024-NEXT: ld1sh { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i16>, ptr %ap
%val = sext <32 x i16> %a to <32 x i64>
- ret <32 x i64> %val
+ store <32 x i64> %val, ptr %b
+ ret void
}
-define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
+define void @load_zext_v32i32i64(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_1024-NEXT: ld1w { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i32>, ptr %ap
%val = zext <32 x i32> %a to <32 x i64>
- ret <32 x i64> %val
+ store <32 x i64> %val, ptr %b
+ ret void
}
-define <32 x i64> @load_sext_v32i32i64(ptr %ap) #0 {
+define void @load_sext_v32i32i64(ptr %ap, ptr %b) #0 {
; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_1024-NEXT: ld1sw { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i32>, ptr %ap
%val = sext <32 x i32> %a to <32 x i64>
- ret <32 x i64> %val
+ store <32 x i64> %val, ptr %b
+ ret void
}
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
index 2dd06e08d16b631..977c528e2583af3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
@@ -18,8 +18,8 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: mov v0.h[3], v1.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
- %r = insertelement <4 x half> %op1, half 5.0, i64 3
- ret <4 x half> %r
+ %r = insertelement <4 x half> %op1, half 5.0, i64 3
+ ret <4 x half> %r
}
; Don't use SVE for 128-bit vectors.
@@ -29,101 +29,105 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: fmov h1, #5.00000000
; CHECK-NEXT: mov v0.h[7], v1.h[0]
; CHECK-NEXT: ret
- %r = insertelement <8 x half> %op1, half 5.0, i64 7
- ret <8 x half> %r
+ %r = insertelement <8 x half> %op1, half 5.0, i64 7
+ ret <8 x half> %r
}
-define <16 x half> @insertelement_v16f16(ptr %a) vscale_range(2,0) #0 {
+define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: mov w8, #15 // =0xf
; CHECK-NEXT: index z0.h, #0, #1
; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: mov z1.h, w9
+; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h
; CHECK-NEXT: fmov h0, #5.00000000
; CHECK-NEXT: mov z2.h, p1/m, h0
-; CHECK-NEXT: st1h { z2.h }, p0, [x8]
+; CHECK-NEXT: st1h { z2.h }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <16 x half>, ptr %a
- %r = insertelement <16 x half> %op1, half 5.0, i64 15
- ret <16 x half> %r
+ %op1 = load <16 x half>, ptr %a
+ %r = insertelement <16 x half> %op1, half 5.0, i64 15
+ store <16 x half> %r, ptr %b
+ ret void
}
-define <32 x half> @insertelement_v32f16(ptr %a) #0 {
+define void @insertelement_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: insertelement_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov w9, #15 // =0xf
+; VBITS_GE_256-NEXT: mov w8, #15 // =0xf
; VBITS_GE_256-NEXT: index z0.h, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.h
-; VBITS_GE_256-NEXT: mov z1.h, w9
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
+; VBITS_GE_256-NEXT: mov z1.h, w8
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: fmov h2, #5.00000000
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z3.h, p1/m, h2
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: mov w9, #31 // =0x1f
+; VBITS_GE_512-NEXT: mov w8, #31 // =0x1f
; VBITS_GE_512-NEXT: index z0.h, #0, #1
; VBITS_GE_512-NEXT: ptrue p1.h
-; VBITS_GE_512-NEXT: mov z1.h, w9
+; VBITS_GE_512-NEXT: mov z1.h, w8
; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h
; VBITS_GE_512-NEXT: fmov h0, #5.00000000
; VBITS_GE_512-NEXT: mov z2.h, p1/m, h0
-; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x8]
+; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
- %op1 = load <32 x half>, ptr %a
- %r = insertelement <32 x half> %op1, half 5.0, i64 31
- ret <32 x half> %r
+ %op1 = load <32 x half>, ptr %a
+ %r = insertelement <32 x half> %op1, half 5.0, i64 31
+ store <32 x half> %r, ptr %b
+ ret void
}
-define <64 x half> @insertelement_v64f16(ptr %a) vscale_range(8,0) #0 {
+define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: insertelement_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
-; CHECK-NEXT: mov w9, #63 // =0x3f
+; CHECK-NEXT: mov w8, #63 // =0x3f
; CHECK-NEXT: index z0.h, #0, #1
; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: mov z1.h, w9
+; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h
; CHECK-NEXT: fmov h0, #5.00000000
; CHECK-NEXT: mov z2.h, p1/m, h0
-; CHECK-NEXT: st1h { z2.h }, p0, [x8]
+; CHECK-NEXT: st1h { z2.h }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <64 x half>, ptr %a
- %r = insertelement <64 x half> %op1, half 5.0, i64 63
- ret <64 x half> %r
+ %op1 = load <64 x half>, ptr %a
+ %r = insertelement <64 x half> %op1, half 5.0, i64 63
+ store <64 x half> %r, ptr %b
+ ret void
}
-define <128 x half> @insertelement_v128f16(ptr %a) vscale_range(16,0) #0 {
+define void @insertelement_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: insertelement_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
-; CHECK-NEXT: mov w9, #127 // =0x7f
+; CHECK-NEXT: mov w8, #127 // =0x7f
; CHECK-NEXT: index z0.h, #0, #1
; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: mov z1.h, w9
+; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h
; CHECK-NEXT: fmov h0, #5.00000000
; CHECK-NEXT: mov z2.h, p1/m, h0
-; CHECK-NEXT: st1h { z2.h }, p0, [x8]
+; CHECK-NEXT: st1h { z2.h }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <128 x half>, ptr %a
- %r = insertelement <128 x half> %op1, half 5.0, i64 127
- ret <128 x half> %r
+ %op1 = load <128 x half>, ptr %a
+ %r = insertelement <128 x half> %op1, half 5.0, i64 127
+ store <128 x half> %r, ptr %b
+ ret void
}
; Don't use SVE for 64-bit vectors.
@@ -135,8 +139,8 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
- %r = insertelement <2 x float> %op1, float 5.0, i64 1
- ret <2 x float> %r
+ %r = insertelement <2 x float> %op1, float 5.0, i64 1
+ ret <2 x float> %r
}
; Don't use SVE for 128-bit vectors.
@@ -146,101 +150,105 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: fmov s1, #5.00000000
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
- %r = insertelement <4 x float> %op1, float 5.0, i64 3
- ret <4 x float> %r
+ %r = insertelement <4 x float> %op1, float 5.0, i64 3
+ ret <4 x float> %r
}
-define <8 x float> @insertelement_v8f32(ptr %a) vscale_range(2,0) #0 {
+define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: mov w8, #7 // =0x7
; CHECK-NEXT: index z0.s, #0, #1
; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: mov z1.s, w9
+; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s
; CHECK-NEXT: fmov s0, #5.00000000
; CHECK-NEXT: mov z2.s, p1/m, s0
-; CHECK-NEXT: st1w { z2.s }, p0, [x8]
+; CHECK-NEXT: st1w { z2.s }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <8 x float>, ptr %a
- %r = insertelement <8 x float> %op1, float 5.0, i64 7
- ret <8 x float> %r
+ %op1 = load <8 x float>, ptr %a
+ %r = insertelement <8 x float> %op1, float 5.0, i64 7
+ store <8 x float> %r, ptr %b
+ ret void
}
-define <16 x float> @insertelement_v16f32(ptr %a) #0 {
+define void @insertelement_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: insertelement_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov w9, #7 // =0x7
+; VBITS_GE_256-NEXT: mov w8, #7 // =0x7
; VBITS_GE_256-NEXT: index z0.s, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: mov z1.s, w9
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT: mov z1.s, w8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: fmov s2, #5.00000000
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z3.s, p1/m, s2
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov w9, #15 // =0xf
+; VBITS_GE_512-NEXT: mov w8, #15 // =0xf
; VBITS_GE_512-NEXT: index z0.s, #0, #1
; VBITS_GE_512-NEXT: ptrue p1.s
-; VBITS_GE_512-NEXT: mov z1.s, w9
+; VBITS_GE_512-NEXT: mov z1.s, w8
; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s
; VBITS_GE_512-NEXT: fmov s0, #5.00000000
; VBITS_GE_512-NEXT: mov z2.s, p1/m, s0
-; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
- %op1 = load <16 x float>, ptr %a
- %r = insertelement <16 x float> %op1, float 5.0, i64 15
- ret <16 x float> %r
+ %op1 = load <16 x float>, ptr %a
+ %r = insertelement <16 x float> %op1, float 5.0, i64 15
+ store <16 x float> %r, ptr %b
+ ret void
}
-define <32 x float> @insertelement_v32f32(ptr %a) vscale_range(8,0) #0 {
+define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: insertelement_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
-; CHECK-NEXT: mov w9, #31 // =0x1f
+; CHECK-NEXT: mov w8, #31 // =0x1f
; CHECK-NEXT: index z0.s, #0, #1
; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: mov z1.s, w9
+; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s
; CHECK-NEXT: fmov s0, #5.00000000
; CHECK-NEXT: mov z2.s, p1/m, s0
-; CHECK-NEXT: st1w { z2.s }, p0, [x8]
+; CHECK-NEXT: st1w { z2.s }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <32 x float>, ptr %a
- %r = insertelement <32 x float> %op1, float 5.0, i64 31
- ret <32 x float> %r
+ %op1 = load <32 x float>, ptr %a
+ %r = insertelement <32 x float> %op1, float 5.0, i64 31
+ store <32 x float> %r, ptr %b
+ ret void
}
-define <64 x float> @insertelement_v64f32(ptr %a) vscale_range(16,0) #0 {
+define void @insertelement_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: insertelement_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
-; CHECK-NEXT: mov w9, #63 // =0x3f
+; CHECK-NEXT: mov w8, #63 // =0x3f
; CHECK-NEXT: index z0.s, #0, #1
; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: mov z1.s, w9
+; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s
; CHECK-NEXT: fmov s0, #5.00000000
; CHECK-NEXT: mov z2.s, p1/m, s0
-; CHECK-NEXT: st1w { z2.s }, p0, [x8]
+; CHECK-NEXT: st1w { z2.s }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <64 x float>, ptr %a
- %r = insertelement <64 x float> %op1, float 5.0, i64 63
- ret <64 x float> %r
+ %op1 = load <64 x float>, ptr %a
+ %r = insertelement <64 x float> %op1, float 5.0, i64 63
+ store <64 x float> %r, ptr %b
+ ret void
}
; Don't use SVE for 64-bit vectors.
@@ -250,8 +258,8 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0
; CHECK-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
- %r = insertelement <1 x double> %op1, double 5.0, i64 0
- ret <1 x double> %r
+ %r = insertelement <1 x double> %op1, double 5.0, i64 0
+ ret <1 x double> %r
}
; Don't use SVE for 128-bit vectors.
@@ -261,101 +269,105 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0
; CHECK-NEXT: fmov d1, #5.00000000
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
- %r = insertelement <2 x double> %op1, double 5.0, i64 1
- ret <2 x double> %r
+ %r = insertelement <2 x double> %op1, double 5.0, i64 1
+ ret <2 x double> %r
}
-define <4 x double> @insertelement_v4f64(ptr %a) vscale_range(2,0) #0 {
+define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: mov w8, #3 // =0x3
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d
; CHECK-NEXT: fmov d0, #5.00000000
; CHECK-NEXT: mov z2.d, p1/m, d0
-; CHECK-NEXT: st1d { z2.d }, p0, [x8]
+; CHECK-NEXT: st1d { z2.d }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <4 x double>, ptr %a
- %r = insertelement <4 x double> %op1, double 5.0, i64 3
- ret <4 x double> %r
+ %op1 = load <4 x double>, ptr %a
+ %r = insertelement <4 x double> %op1, double 5.0, i64 3
+ store <4 x double> %r, ptr %b
+ ret void
}
-define <8 x double> @insertelement_v8f64(ptr %a) #0 {
+define void @insertelement_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov w9, #3 // =0x3
+; VBITS_GE_256-NEXT: mov w8, #3 // =0x3
; VBITS_GE_256-NEXT: index z0.d, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: mov z1.d, x9
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov z1.d, x8
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: fmov d2, #5.00000000
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z3.d, p1/m, d2
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: mov w9, #7 // =0x7
+; VBITS_GE_512-NEXT: mov w8, #7 // =0x7
; VBITS_GE_512-NEXT: index z0.d, #0, #1
; VBITS_GE_512-NEXT: ptrue p1.d
-; VBITS_GE_512-NEXT: mov z1.d, x9
+; VBITS_GE_512-NEXT: mov z1.d, x8
; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d
; VBITS_GE_512-NEXT: fmov d0, #5.00000000
; VBITS_GE_512-NEXT: mov z2.d, p1/m, d0
-; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
- %op1 = load <8 x double>, ptr %a
- %r = insertelement <8 x double> %op1, double 5.0, i64 7
- ret <8 x double> %r
+ %op1 = load <8 x double>, ptr %a
+ %r = insertelement <8 x double> %op1, double 5.0, i64 7
+ store <8 x double> %r, ptr %b
+ ret void
}
-define <16 x double> @insertelement_v16f64(ptr %a) vscale_range(8,0) #0 {
+define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: insertelement_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
-; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: mov w8, #15 // =0xf
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d
; CHECK-NEXT: fmov d0, #5.00000000
; CHECK-NEXT: mov z2.d, p1/m, d0
-; CHECK-NEXT: st1d { z2.d }, p0, [x8]
+; CHECK-NEXT: st1d { z2.d }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <16 x double>, ptr %a
- %r = insertelement <16 x double> %op1, double 5.0, i64 15
- ret <16 x double> %r
+ %op1 = load <16 x double>, ptr %a
+ %r = insertelement <16 x double> %op1, double 5.0, i64 15
+ store <16 x double> %r, ptr %b
+ ret void
}
-define <32 x double> @insertelement_v32f64(ptr %a) vscale_range(16,0) #0 {
+define void @insertelement_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: insertelement_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
-; CHECK-NEXT: mov w9, #31 // =0x1f
+; CHECK-NEXT: mov w8, #31 // =0x1f
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d
; CHECK-NEXT: fmov d0, #5.00000000
; CHECK-NEXT: mov z2.d, p1/m, d0
-; CHECK-NEXT: st1d { z2.d }, p0, [x8]
+; CHECK-NEXT: st1d { z2.d }, p0, [x1]
; CHECK-NEXT: ret
- %op1 = load <32 x double>, ptr %a
- %r = insertelement <32 x double> %op1, double 5.0, i64 31
- ret <32 x double> %r
+ %op1 = load <32 x double>, ptr %a
+ %r = insertelement <32 x double> %op1, double 5.0, i64 31
+ store <32 x double> %r, ptr %b
+ ret void
}
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
similarity index 57%
rename from llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
rename to llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
index 8c574f8e4716a7d..becddaea31267a5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
@@ -19,178 +19,186 @@
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors.
-define <2 x float> @load_v2f32(ptr %a) #0 {
+define void @load_v2f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: load_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
%load = load <2 x float>, ptr %a
- ret <2 x float> %load
+ store <2 x float> %load, ptr %b
+ ret void
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @load_v4f32(ptr %a) #0 {
+define void @load_v4f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: load_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%load = load <4 x float>, ptr %a
- ret <4 x float> %load
+ store <4 x float> %load, ptr %b
+ ret void
}
-define <8 x float> @load_v8f32(ptr %a) #0 {
+define void @load_v8f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: load_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%load = load <8 x float>, ptr %a
- ret <8 x float> %load
+ store <8 x float> %load, ptr %b
+ ret void
}
-define <16 x float> @load_v16f32(ptr %a) #0 {
+define void @load_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: load_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v16f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%load = load <16 x float>, ptr %a
- ret <16 x float> %load
+ store <16 x float> %load, ptr %b
+ ret void
}
-define <32 x float> @load_v32f32(ptr %a) #0 {
+define void @load_v32f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: load_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: mov x9, #24 // =0x18
-; VBITS_GE_256-NEXT: mov x10, #16 // =0x10
-; VBITS_GE_256-NEXT: mov x11, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: mov x10, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v32f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%load = load <32 x float>, ptr %a
- ret <32 x float> %load
+ store <32 x float> %load, ptr %b
+ ret void
}
-define <64 x float> @load_v64f32(ptr %a) #0 {
+define void @load_v64f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: load_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x9, #24 // =0x18
; VBITS_GE_256-NEXT: mov x10, #16 // =0x10
-; VBITS_GE_256-NEXT: mov x11, #24 // =0x18
-; VBITS_GE_256-NEXT: mov x12, #56 // =0x38
-; VBITS_GE_256-NEXT: mov x13, #32 // =0x20
-; VBITS_GE_256-NEXT: mov x14, #48 // =0x30
-; VBITS_GE_256-NEXT: mov x15, #40 // =0x28
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x15, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: mov x11, #48 // =0x30
+; VBITS_GE_256-NEXT: mov x12, #40 // =0x28
+; VBITS_GE_256-NEXT: mov x13, #56 // =0x38
+; VBITS_GE_256-NEXT: mov x14, #32 // =0x20
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x15, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x13, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x12, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v64f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: mov x8, #32 // =0x20
; VBITS_GE_512-NEXT: mov x9, #48 // =0x30
-; VBITS_GE_512-NEXT: mov x10, #32 // =0x20
-; VBITS_GE_512-NEXT: mov x11, #16 // =0x10
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_512-NEXT: mov x10, #16 // =0x10
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v64f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%load = load <64 x float>, ptr %a
- ret <64 x float> %load
+ store <64 x float> %load, ptr %b
+ ret void
}
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index 5dfce78af18b8e6..fee233643a8e569 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -68,7 +68,7 @@ define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
ret <4 x float> %load
}
-define <8 x float> @masked_load_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
+define void @masked_load_v8f32(ptr %ap, ptr %bp, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -76,30 +76,31 @@ define <8 x float> @masked_load_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%a = load <8 x float>, ptr %ap
%b = load <8 x float>, ptr %bp
%mask = fcmp oeq <8 x float> %a, %b
%load = call <8 x float> @llvm.masked.load.v8f32(ptr %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
- ret <8 x float> %load
+ store <8 x float> %load, ptr %c
+ ret void
}
-define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v16f32:
@@ -109,16 +110,17 @@ define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <16 x float>, ptr %ap
%b = load <16 x float>, ptr %bp
%mask = fcmp oeq <16 x float> %a, %b
%load = call <16 x float> @llvm.masked.load.v16f32(ptr %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
- ret <16 x float> %load
+ store <16 x float> %load, ptr %c
+ ret void
}
-define <32 x float> @masked_load_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
+define void @masked_load_v32f32(ptr %ap, ptr %bp, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_load_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
@@ -126,16 +128,17 @@ define <32 x float> @masked_load_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%a = load <32 x float>, ptr %ap
%b = load <32 x float>, ptr %bp
%mask = fcmp oeq <32 x float> %a, %b
%load = call <32 x float> @llvm.masked.load.v32f32(ptr %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
- ret <32 x float> %load
+ store <32 x float> %load, ptr %c
+ ret void
}
-define <64 x float> @masked_load_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_v64f32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
@@ -143,30 +146,31 @@ define <64 x float> @masked_load_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%a = load <64 x float>, ptr %ap
%b = load <64 x float>, ptr %bp
%mask = fcmp oeq <64 x float> %a, %b
%load = call <64 x float> @llvm.masked.load.v64f32(ptr %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
- ret <64 x float> %load
+ store <64 x float> %load, ptr %c
+ ret void
}
-define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 {
+define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov w9, #32 // =0x20
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x9]
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p2/z, [x0]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x8, x9]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x8]
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x2, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v64i8:
@@ -176,30 +180,31 @@ define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x8]
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <64 x i8>, ptr %ap
%b = load <64 x i8>, ptr %bp
%mask = icmp eq <64 x i8> %a, %b
%load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
- ret <64 x i8> %load
+ store <64 x i8> %load, ptr %c
+ ret void
}
-define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 {
+define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p2/z, [x0]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v32i16:
@@ -209,30 +214,31 @@ define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <32 x i16>, ptr %ap
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %a, %b
%load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
- ret <32 x i16> %load
+ store <32 x i16> %load, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v16i32:
@@ -242,30 +248,31 @@ define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, ptr %ap
%b = load <16 x i32>, ptr %bp
%mask = icmp eq <16 x i32> %a, %b
%load = call <16 x i32> @llvm.masked.load.v16i32(ptr %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
- ret <16 x i32> %load
+ store <16 x i32> %load, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v8i64:
@@ -275,32 +282,33 @@ define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, ptr %ap
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %a, %b
%load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
- ret <8 x i64> %load
+ store <8 x i64> %load, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
@@ -311,32 +319,33 @@ define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, ptr %ap
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %a, %b
%load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
- ret <8 x i64> %load
+ store <8 x i64> %load, ptr %c
+ ret void
}
-define <8 x double> @masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
@@ -347,20 +356,21 @@ define <8 x double> @masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <8 x double>, ptr %ap
%b = load <8 x double>, ptr %bp
%mask = fcmp oeq <8 x double> %a, %b
%load = call <8 x double> @llvm.masked.load.v8f64(ptr %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
- ret <8 x double> %load
+ store <8 x double> %load, ptr %c
+ ret void
}
-define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
@@ -368,8 +378,8 @@ define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
@@ -378,21 +388,22 @@ define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <32 x i8>, ptr %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = sext <32 x i8> %load to <32 x i16>
- ret <32 x i16> %ext
+ store <32 x i16> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl16
; VBITS_GE_256-NEXT: ldr q0, [x1]
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
@@ -402,8 +413,8 @@ define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
@@ -412,21 +423,22 @@ define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i8>, ptr %bp
%mask = icmp eq <16 x i8> %b, zeroinitializer
%load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
%ext = sext <16 x i8> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl8
; VBITS_GE_256-NEXT: ldr d0, [x1]
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
@@ -437,8 +449,8 @@ define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
@@ -447,20 +459,21 @@ define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i8>, ptr %bp
%mask = icmp eq <8 x i8> %b, zeroinitializer
%load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
%ext = sext <8 x i8> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
@@ -468,8 +481,8 @@ define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
@@ -478,21 +491,22 @@ define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i16>, ptr %bp
%mask = icmp eq <16 x i16> %b, zeroinitializer
%load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
%ext = sext <16 x i16> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: ldr q0, [x1]
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
@@ -502,8 +516,8 @@ define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
@@ -512,20 +526,21 @@ define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i16>, ptr %bp
%mask = icmp eq <8 x i16> %b, zeroinitializer
%load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
%ext = sext <8 x i16> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -533,8 +548,8 @@ define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
@@ -543,20 +558,21 @@ define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, ptr %bp
%mask = icmp eq <8 x i32> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = sext <8 x i32> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
@@ -564,8 +580,8 @@ define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
@@ -574,21 +590,22 @@ define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <32 x i8>, ptr %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = zext <32 x i8> %load to <32 x i16>
- ret <32 x i16> %ext
+ store <32 x i16> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl16
; VBITS_GE_256-NEXT: ldr q0, [x1]
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
@@ -598,8 +615,8 @@ define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
@@ -608,21 +625,22 @@ define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i8>, ptr %bp
%mask = icmp eq <16 x i8> %b, zeroinitializer
%load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
%ext = zext <16 x i8> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl8
; VBITS_GE_256-NEXT: ldr d0, [x1]
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
@@ -633,8 +651,8 @@ define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
@@ -643,20 +661,21 @@ define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i8>, ptr %bp
%mask = icmp eq <8 x i8> %b, zeroinitializer
%load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
%ext = zext <8 x i8> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
@@ -664,8 +683,8 @@ define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
@@ -674,21 +693,22 @@ define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i16>, ptr %bp
%mask = icmp eq <16 x i16> %b, zeroinitializer
%load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
%ext = zext <16 x i16> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: ldr q0, [x1]
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
@@ -698,8 +718,8 @@ define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
@@ -708,20 +728,21 @@ define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i16>, ptr %bp
%mask = icmp eq <8 x i16> %b, zeroinitializer
%load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
%ext = zext <8 x i16> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -729,8 +750,8 @@ define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
@@ -739,21 +760,22 @@ define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, ptr %bp
%mask = icmp eq <8 x i32> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = zext <8 x i32> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, #0
@@ -769,8 +791,8 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
@@ -779,21 +801,22 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = sext <32 x i8> %load to <32 x i16>
- ret <32 x i16> %ext
+ store <32 x i16> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
@@ -812,8 +835,8 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
@@ -822,21 +845,22 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i32>, ptr %bp
%mask = icmp eq <16 x i32> %b, zeroinitializer
%load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
%ext = sext <16 x i8> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
@@ -857,8 +881,8 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
@@ -867,21 +891,22 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %b, zeroinitializer
%load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
%ext = sext <8 x i8> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
@@ -899,8 +924,8 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
@@ -909,21 +934,22 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i32>, ptr %bp
%mask = icmp eq <16 x i32> %b, zeroinitializer
%load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
%ext = sext <16 x i16> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
@@ -942,8 +968,8 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
@@ -952,21 +978,22 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %b, zeroinitializer
%load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
%ext = sext <8 x i16> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
@@ -982,8 +1009,8 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
@@ -992,21 +1019,22 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = sext <8 x i32> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, #0
@@ -1022,8 +1050,8 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
@@ -1032,21 +1060,22 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = zext <32 x i8> %load to <32 x i16>
- ret <32 x i16> %ext
+ store <32 x i16> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
@@ -1065,8 +1094,8 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
@@ -1075,21 +1104,22 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i32>, ptr %bp
%mask = icmp eq <16 x i32> %b, zeroinitializer
%load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
%ext = zext <16 x i8> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
@@ -1110,8 +1140,8 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
@@ -1120,21 +1150,22 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %b, zeroinitializer
%load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
%ext = zext <8 x i8> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
@@ -1152,8 +1183,8 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
@@ -1162,21 +1193,22 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i32>, ptr %bp
%mask = icmp eq <16 x i32> %b, zeroinitializer
%load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
%ext = zext <16 x i16> %load to <16 x i32>
- ret <16 x i32> %ext
+ store <16 x i32> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
@@ -1195,8 +1227,8 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
@@ -1205,21 +1237,22 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %b, zeroinitializer
%load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
%ext = zext <8 x i16> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
@@ -1235,8 +1268,8 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
@@ -1245,212 +1278,225 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i64>, ptr %bp
%mask = icmp eq <8 x i64> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = zext <8 x i32> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <128 x i16> @masked_load_sext_v128i8i16(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_sext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v128i8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0]
-; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; CHECK-NEXT: ret
%b = load <128 x i8>, ptr %bp
%mask = icmp eq <128 x i8> %b, zeroinitializer
%load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
%ext = sext <128 x i8> %load to <128 x i16>
- ret <128 x i16> %ext
+ store <128 x i16> %ext, ptr %c
+ ret void
}
-define <64 x i32> @masked_load_sext_v64i8i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_sext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v64i8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1sb { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%b = load <64 x i8>, ptr %bp
%mask = icmp eq <64 x i8> %b, zeroinitializer
%load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
%ext = sext <64 x i8> %load to <64 x i32>
- ret <64 x i32> %ext
+ store <64 x i32> %ext, ptr %c
+ ret void
}
-define <32 x i64> @masked_load_sext_v32i8i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_sext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1sb { z0.d }, p1/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%b = load <32 x i8>, ptr %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = sext <32 x i8> %load to <32 x i64>
- ret <32 x i64> %ext
+ store <32 x i64> %ext, ptr %c
+ ret void
}
-define <64 x i32> @masked_load_sext_v64i16i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_sext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v64i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%b = load <64 x i16>, ptr %bp
%mask = icmp eq <64 x i16> %b, zeroinitializer
%load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
%ext = sext <64 x i16> %load to <64 x i32>
- ret <64 x i32> %ext
+ store <64 x i32> %ext, ptr %c
+ ret void
}
-define <32 x i64> @masked_load_sext_v32i16i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_sext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1sh { z0.d }, p1/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
%ext = sext <32 x i16> %load to <32 x i64>
- ret <32 x i64> %ext
+ store <32 x i64> %ext, ptr %c
+ ret void
}
-define <32 x i64> @masked_load_sext_v32i32i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_sext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%b = load <32 x i32>, ptr %bp
%mask = icmp eq <32 x i32> %b, zeroinitializer
%load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
%ext = sext <32 x i32> %load to <32 x i64>
- ret <32 x i64> %ext
+ store <32 x i64> %ext, ptr %c
+ ret void
}
-define <128 x i16> @masked_load_zext_v128i8i16(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v128i8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0]
-; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; CHECK-NEXT: ret
%b = load <128 x i8>, ptr %bp
%mask = icmp eq <128 x i8> %b, zeroinitializer
%load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
%ext = zext <128 x i8> %load to <128 x i16>
- ret <128 x i16> %ext
+ store <128 x i16> %ext, ptr %c
+ ret void
}
-define <64 x i32> @masked_load_zext_v64i8i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_zext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1b { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%b = load <64 x i8>, ptr %bp
%mask = icmp eq <64 x i8> %b, zeroinitializer
%load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
%ext = zext <64 x i8> %load to <64 x i32>
- ret <64 x i32> %ext
+ store <64 x i32> %ext, ptr %c
+ ret void
}
-define <32 x i64> @masked_load_zext_v32i8i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_zext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1b { z0.d }, p1/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%b = load <32 x i8>, ptr %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = zext <32 x i8> %load to <32 x i64>
- ret <32 x i64> %ext
+ store <32 x i64> %ext, ptr %c
+ ret void
}
-define <64 x i32> @masked_load_zext_v64i16i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_zext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%b = load <64 x i16>, ptr %bp
%mask = icmp eq <64 x i16> %b, zeroinitializer
%load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
%ext = zext <64 x i16> %load to <64 x i32>
- ret <64 x i32> %ext
+ store <64 x i32> %ext, ptr %c
+ ret void
}
-define <32 x i64> @masked_load_zext_v32i16i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_zext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1h { z0.d }, p1/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
%ext = zext <32 x i16> %load to <32 x i64>
- ret <32 x i64> %ext
+ store <32 x i64> %ext, ptr %c
+ ret void
}
-define <32 x i64> @masked_load_zext_v32i32i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
+define void @masked_load_zext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%b = load <32 x i32>, ptr %bp
%mask = icmp eq <32 x i32> %b, zeroinitializer
%load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
%ext = zext <32 x i32> %load to <32 x i64>
- ret <32 x i64> %ext
+ store <32 x i64> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -1458,8 +1504,8 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
@@ -1468,20 +1514,21 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, ptr %bp
%mask = icmp ugt <8 x i32> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = sext <8 x i32> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
-define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 {
+define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -1489,8 +1536,8 @@ define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
@@ -1499,13 +1546,14 @@ define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, ptr %bp
%mask = icmp sgt <8 x i32> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = zext <8 x i32> %load to <8 x i64>
- ret <8 x i64> %ext
+ store <8 x i64> %ext, ptr %c
+ ret void
}
declare <2 x half> @llvm.masked.load.v2f16(ptr, i32, <2 x i1>, <2 x half>)
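
(As a quick illustration of the pattern applied throughout this file: the tests stop returning wide fixed-length vectors and instead take an extra pointer argument and store the result through it, which is why the final CHECK stores now target x2 rather than the implicit indirect-result register x8. The sketch below is illustrative only; the function name @masked_load_example is hypothetical and simply mirrors the masked_load_v8i64 test above.)

; Before: the wide fixed-length result is returned by value.
define <8 x i64> @masked_load_example(ptr %ap, ptr %bp) #0 {
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  ret <8 x i64> %load
}

; After: the function returns void and writes the result through an explicit
; third pointer argument, so the generated st1d is checked against x2.
define void @masked_load_example(ptr %ap, ptr %bp, ptr %c) #0 {
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  store <8 x i64> %load, ptr %c
  ret void
}
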
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index 113f7a9465a1fe8..f97ca05f3bdd4b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -712,82 +712,88 @@ define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 {
ret void
}
-define <8 x float> @load_splat_v8f32(ptr %p) vscale_range(2,2) #0 {
+define void @load_splat_v8f32(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z0.s, s0
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
- %v = load <8 x float>, ptr %p
+ %v = load <8 x float>, ptr %a
%splat = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer
- ret <8 x float> %splat
+ store <8 x float> %splat, ptr %b
+ ret void
}
-define <4 x double> @load_splat_v4f64(ptr %p) vscale_range(2,2) #0 {
+define void @load_splat_v4f64(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: mov z0.d, d0
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
- %v = load <4 x double>, ptr %p
+ %v = load <4 x double>, ptr %a
%splat = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> zeroinitializer
- ret <4 x double> %splat
+ store <4 x double> %splat, ptr %b
+ ret void
}
-define <32 x i8> @load_splat_v32i8(ptr %p) vscale_range(2,2) #0 {
+define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: mov z0.b, b0
-; CHECK-NEXT: st1b { z0.b }, p0, [x8]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
- %v = load <32 x i8>, ptr %p
+ %v = load <32 x i8>, ptr %a
%splat = shufflevector <32 x i8> %v, <32 x i8> undef, <32 x i32> zeroinitializer
- ret <32 x i8> %splat
+ store <32 x i8> %splat, ptr %b
+ ret void
}
-define <16 x i16> @load_splat_v16i16(ptr %p) vscale_range(2,2) #0 {
+define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: mov z0.h, h0
-; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
- %v = load <16 x i16>, ptr %p
+ %v = load <16 x i16>, ptr %a
%splat = shufflevector <16 x i16> %v, <16 x i16> undef, <16 x i32> zeroinitializer
- ret <16 x i16> %splat
+ store <16 x i16> %splat, ptr %b
+ ret void
}
-define <8 x i32> @load_splat_v8i32(ptr %p) vscale_range(2,2) #0 {
+define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z0.s, s0
-; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
- %v = load <8 x i32>, ptr %p
+ %v = load <8 x i32>, ptr %a
%splat = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> zeroinitializer
- ret <8 x i32> %splat
+ store <8 x i32> %splat, ptr %b
+ ret void
}
-define <4 x i64> @load_splat_v4i64(ptr %p) vscale_range(2,2) #0 {
+define void @load_splat_v4i64(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: mov z0.d, d0
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
- %v = load <4 x i64>, ptr %p
+ %v = load <4 x i64>, ptr %a
%splat = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> zeroinitializer
- ret <4 x i64> %splat
+ store <4 x i64> %splat, ptr %b
+ ret void
}
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
deleted file mode 100644
index 03bff6cb9b62df2..000000000000000
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
+++ /dev/null
@@ -1,180 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
-
-target triple = "aarch64-unknown-linux-gnu"
-
-; Don't use SVE for 64-bit vectors.
-define void @store_v2f32(ptr %a) #0 {
-; CHECK-LABEL: store_v2f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str xzr, [x0]
-; CHECK-NEXT: ret
- store <2 x float> zeroinitializer, ptr %a
- ret void
-}
-
-; Don't use SVE for 128-bit vectors.
-define void @store_v4f32(ptr %a) #0 {
-; CHECK-LABEL: store_v4f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp xzr, xzr, [x0]
-; CHECK-NEXT: ret
- store <4 x float> zeroinitializer, ptr %a
- ret void
-}
-
-define void @store_v8f32(ptr %a) #0 {
-; CHECK-LABEL: store_v8f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
-; CHECK-NEXT: ret
- store <8 x float> zeroinitializer, ptr %a
- ret void
-}
-
-define void @store_v16f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: store_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: store_v16f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: store_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: store_v16f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
-; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
- store <16 x float> zeroinitializer, ptr %a
- ret void
-}
-
-define void @store_v32f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: store_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT: mov x8, #24 // =0x18
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: store_v32f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT: mov x8, #16 // =0x10
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: store_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: store_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
- store <32 x float> zeroinitializer, ptr %a
- ret void
-}
-
-define void @store_v64f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: store_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT: mov x8, #56 // =0x38
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #48 // =0x30
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #40 // =0x28
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #32 // =0x20
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #24 // =0x18
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: store_v64f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT: mov x8, #48 // =0x30
-; VBITS_GE_512-NEXT: mov x9, #32 // =0x20
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_512-NEXT: mov x8, #16 // =0x10
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: store_v64f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: store_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
- store <64 x float> zeroinitializer, ptr %a
- ret void
-}
-
-attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
index e742836d79fbe58..79ef20270eda85b 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -35,23 +35,23 @@ define <vscale x 2 x double> @test_post_ld1_dup(ptr %a, ptr %ptr, i64 %inc) {
ret <vscale x 2 x double> %dup
}
-define <4 x i64> @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr) #1 {
+define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 {
; CHECK-LABEL: test_post_ld1_int_fixed:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: ptrue p1.d, vl1
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ldr x9, [x0]
-; CHECK-NEXT: ldr x10, [x0, x1, lsl #3]
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: ldr x9, [x0, x1, lsl #3]
; CHECK-NEXT: mov z0.d, z2.d
-; CHECK-NEXT: mov z2.d, p2/m, x10
-; CHECK-NEXT: mov z0.d, p1/m, x9
+; CHECK-NEXT: mov z2.d, p2/m, x9
+; CHECK-NEXT: mov z0.d, p1/m, x8
; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x3]
; CHECK-NEXT: ret
%A = load <4 x i64>, ptr %addr
%ld1 = load i64, ptr %data
@@ -60,16 +60,17 @@ define <4 x i64> @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr) #1 {
%ld2 = load i64, ptr %gep
%vec2 = insertelement <4 x i64> %A, i64 %ld2, i32 2
%res = add <4 x i64> %vec1, %vec2
- ret <4 x i64> %res
+ store <4 x i64> %res, ptr %res_ptr
+ ret void
}
-define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr) #1 {
+define void @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 {
; CHECK-LABEL: test_post_ld1_double_fixed:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: ptrue p1.d, vl1
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d
@@ -78,7 +79,7 @@ define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr)
; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d
; CHECK-NEXT: mov z2.d, p2/m, d1
; CHECK-NEXT: fadd z0.d, z0.d, z2.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x3]
; CHECK-NEXT: ret
%A = load <4 x double>, ptr %addr
%ld1 = load double, ptr %data
@@ -87,7 +88,8 @@ define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr)
%ld2 = load double, ptr %gep
%vec2 = insertelement <4 x double> %A, double %ld2, i32 2
%res = fadd <4 x double> %vec1, %vec2
- ret <4 x double> %res
+ store <4 x double> %res, ptr %res_ptr
+ ret void
}
attributes #1 = { vscale_range(2,2) "target-features"="+neon,+sve" }
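
Not part of the patch: a minimal standalone sketch of the rewrite applied to each test above, using hypothetical function names. Instead of returning a wide fixed-length vector (which the old CHECK lines matched as an indirect return stored through x8), the updated tests take an extra pointer argument, store the result, and return void, so the final st1d/st1w is addressed through a named argument register such as x1.

; Old style: wide fixed-length vector returned from the test function.
define <8 x i64> @example_ret(ptr %a) #0 {
  %v = load <8 x i64>, ptr %a
  ret <8 x i64> %v
}

; New style: result written through an explicit pointer argument instead.
define void @example_store(ptr %a, ptr %b) #0 {
  %v = load <8 x i64>, ptr %a
  store <8 x i64> %v, ptr %b
  ret void
}

attributes #0 = { "target-features"="+sve" }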