[llvm] [WIP][LLVM][CodeGen][SVE] Lower to multivector loads instead of splitting them. (PR #150421)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 24 07:11:29 PDT 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/150421
From 31f0d5880655f680080a0fb8a6be7d1476301d3d Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Thu, 24 Jul 2025 14:47:33 +0100
Subject: [PATCH] [WIP][LLVM][CodeGen][SVE] Lower to multivector loads instead
of splitting them.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 123 ++
.../CodeGen/AArch64/sve-multivector-loads.ll | 1135 +++++++++++++++++
2 files changed, 1258 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-multivector-loads.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff23f76fadccd..a7cc628fd6ce1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1884,6 +1884,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
+
+ if (Subtarget->hasSVE2p1() ||
+ (Subtarget->hasSME2() && Subtarget->isStreaming())) {
+ // 2x loads
+ setOperationAction(ISD::LOAD, MVT::nxv32i8, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv16i16, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv8i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv4i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv16f16, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv8f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv4f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv16bf16, Custom);
+
+ // 4x loads
+ setOperationAction(ISD::LOAD, MVT::nxv64i8, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv32i16, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv16i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv8i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv32f16, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv16f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv8f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::nxv32bf16, Custom);
+ }
}
// Handle partial reduction operations
@@ -27909,6 +27932,106 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
+ LSBaseSDNode *LSNode = dyn_cast<LSBaseSDNode>(N);
+ if (LSNode && LSNode->isSimple() && LSNode->isUnindexed() &&
+ LSNode->getValueType(0).isScalableVector() &&
+ N->getValueType(0).isSimple() && N->getValueType(0) == MemVT) {
+ MVT VT = N->getValueType(0).getSimpleVT();
+
+ unsigned IntID;
+ switch (VT.SimpleTy) {
+ default:
+ return;
+ case MVT::nxv32i8:
+ case MVT::nxv16i16:
+ case MVT::nxv8i32:
+ case MVT::nxv4i64:
+ case MVT::nxv16f16:
+ case MVT::nxv8f32:
+ case MVT::nxv4f64:
+ case MVT::nxv16bf16:
+ IntID = Intrinsic::aarch64_sve_ld1_pn_x2;
+ break;
+ case MVT::nxv64i8:
+ case MVT::nxv32i16:
+ case MVT::nxv16i32:
+ case MVT::nxv8i64:
+ case MVT::nxv32f16:
+ case MVT::nxv16f32:
+ case MVT::nxv8f64:
+ case MVT::nxv32bf16:
+ IntID = Intrinsic::aarch64_sve_ld1_pn_x4;
+ break;
+ }
+
+ unsigned PredIntID;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("covered by previous switch");
+
+ case MVT::nxv32i8:
+ case MVT::nxv64i8:
+ PredIntID = Intrinsic::aarch64_sve_ptrue_c8;
+ break;
+ case MVT::nxv16i16:
+ case MVT::nxv16f16:
+ case MVT::nxv16bf16:
+ case MVT::nxv32i16:
+ case MVT::nxv32f16:
+ case MVT::nxv32bf16:
+ PredIntID = Intrinsic::aarch64_sve_ptrue_c16;
+ break;
+ case MVT::nxv8i32:
+ case MVT::nxv8f32:
+ case MVT::nxv16i32:
+ case MVT::nxv16f32:
+ PredIntID = Intrinsic::aarch64_sve_ptrue_c32;
+ break;
+ case MVT::nxv4i64:
+ case MVT::nxv4f64:
+ case MVT::nxv8i64:
+ case MVT::nxv8f64:
+ PredIntID = Intrinsic::aarch64_sve_ptrue_c64;
+ break;
+ }
+
+ SDValue Chain = LSNode->getChain();
+ SDValue Addr = LSNode->getBasePtr();
+ SDValue Offset = LSNode->getOffset();
+
+ if (!Offset.isUndef())
+ return;
+
+ SDLoc DL(N);
+ SDValue PNg =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::aarch64svcount,
+ DAG.getConstant(PredIntID, DL, MVT::i64));
+
+ if (IntID == Intrinsic::aarch64_sve_ld1_pn_x2) {
+ MVT RegVT = VT.getHalfNumVectorElementsVT();
+ SDValue NewLoad = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, DL, {RegVT, RegVT, MVT::Other},
+ {Chain, DAG.getConstant(IntID, DL, MVT::i64), PNg, Addr});
+ Results.push_back(
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ {NewLoad.getValue(0), NewLoad.getValue(1)}));
+ Results.push_back(NewLoad.getValue(2) /* Chain */);
+ return;
+ }
+
+ assert(IntID == Intrinsic::aarch64_sve_ld1_pn_x4);
+ MVT RegVT = VT.getHalfNumVectorElementsVT().getHalfNumVectorElementsVT();
+ SDValue NewLoad = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, DL, {RegVT, RegVT, RegVT, RegVT, MVT::Other},
+ {Chain, DAG.getConstant(IntID, DL, MVT::i64), PNg, Addr});
+ Results.push_back(
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ {NewLoad.getValue(0), NewLoad.getValue(1),
+ NewLoad.getValue(2), NewLoad.getValue(3)}));
+ Results.push_back(NewLoad.getValue(4) /* Chain */);
+ return;
+ }
+
if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
LoadNode->getMemoryVT() != MVT::i128) {
// Non-volatile or atomic loads are optimized later in AArch64's load/store
diff --git a/llvm/test/CodeGen/AArch64/sve-multivector-loads.ll b/llvm/test/CodeGen/AArch64/sve-multivector-loads.ll
new file mode 100644
index 0000000000000..9bdd5547be0f8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-multivector-loads.ll
@@ -0,0 +1,1135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve-b16b16,+sve2p1 < %s -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve-b16b16,+sme2 --force-streaming < %s -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SSVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @load_2x_vectors_i8_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_i8_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.b
+; SVE-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; SVE-NEXT: add z0.b, z0.b, #5 // =0x5
+; SVE-NEXT: add z1.b, z1.b, #5 // =0x5
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i8_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.b
+; SSVE-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; SSVE-NEXT: add z16.b, z16.b, #5 // =0x5
+; SSVE-NEXT: add z24.b, z24.b, #5 // =0x5
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z24, [x0, #1, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 32 x i8>, ptr %addr
+ %b = add <vscale x 32 x i8> %a, splat (i8 5)
+ store <vscale x 32 x i8> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i8_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_i8_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.b
+; SVE-NEXT: ptrue p0.b
+; SVE-NEXT: add x8, x0, x1
+; SVE-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; SVE-NEXT: add z1.b, z1.b, #5 // =0x5
+; SVE-NEXT: add z0.b, z0.b, #5 // =0x5
+; SVE-NEXT: st1b { z0.b }, p0, [x0, x1]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i8_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.b
+; SSVE-NEXT: ptrue p0.b
+; SSVE-NEXT: add x8, x0, x1
+; SSVE-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0, x1]
+; SSVE-NEXT: add z24.b, z24.b, #5 // =0x5
+; SSVE-NEXT: add z16.b, z16.b, #5 // =0x5
+; SSVE-NEXT: st1b { z16.b }, p0, [x0, x1]
+; SSVE-NEXT: str z24, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i8, ptr %base, i64 %idx
+ %a = load <vscale x 32 x i8>, ptr %addr
+ %b = add <vscale x 32 x i8> %a, splat (i8 5)
+ store <vscale x 32 x i8> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i16_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_i16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; SVE-NEXT: add z0.h, z0.h, #5 // =0x5
+; SVE-NEXT: add z1.h, z1.h, #5 // =0x5
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i16_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; SSVE-NEXT: add z16.h, z16.h, #5 // =0x5
+; SSVE-NEXT: add z24.h, z24.h, #5 // =0x5
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z24, [x0, #1, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 16 x i16>, ptr %addr
+ %b = add <vscale x 16 x i16> %a, splat (i16 5)
+ store <vscale x 16 x i16> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i16_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_i16_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: add x8, x0, x1, lsl #1
+; SVE-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
+; SVE-NEXT: add z1.h, z1.h, #5 // =0x5
+; SVE-NEXT: add z0.h, z0.h, #5 // =0x5
+; SVE-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i16_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: add x8, x0, x1, lsl #1
+; SSVE-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0, x1, lsl #1]
+; SSVE-NEXT: add z24.h, z24.h, #5 // =0x5
+; SSVE-NEXT: add z16.h, z16.h, #5 // =0x5
+; SSVE-NEXT: st1h { z16.h }, p0, [x0, x1, lsl #1]
+; SSVE-NEXT: str z24, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i16, ptr %base, i64 %idx
+ %a = load <vscale x 16 x i16>, ptr %addr
+ %b = add <vscale x 16 x i16> %a, splat (i16 5)
+ store <vscale x 16 x i16> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i32_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_i32_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0]
+; SVE-NEXT: add z0.s, z0.s, #5 // =0x5
+; SVE-NEXT: add z1.s, z1.s, #5 // =0x5
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i32_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x0]
+; SSVE-NEXT: add z16.s, z16.s, #5 // =0x5
+; SSVE-NEXT: add z24.s, z24.s, #5 // =0x5
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z24, [x0, #1, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 8 x i32>, ptr %addr
+ %b = add <vscale x 8 x i32> %a, splat (i32 5)
+ store <vscale x 8 x i32> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i32_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_i32_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: add x8, x0, x1, lsl #2
+; SVE-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
+; SVE-NEXT: add z1.s, z1.s, #5 // =0x5
+; SVE-NEXT: add z0.s, z0.s, #5 // =0x5
+; SVE-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i32_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ptrue p0.s
+; SSVE-NEXT: add x8, x0, x1, lsl #2
+; SSVE-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x0, x1, lsl #2]
+; SSVE-NEXT: add z24.s, z24.s, #5 // =0x5
+; SSVE-NEXT: add z16.s, z16.s, #5 // =0x5
+; SSVE-NEXT: st1w { z16.s }, p0, [x0, x1, lsl #2]
+; SSVE-NEXT: str z24, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i32, ptr %base, i64 %idx
+ %a = load <vscale x 8 x i32>, ptr %addr
+ %b = add <vscale x 8 x i32> %a, splat (i32 5)
+ store <vscale x 8 x i32> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i64_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_i64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0]
+; SVE-NEXT: add z0.d, z0.d, #5 // =0x5
+; SVE-NEXT: add z1.d, z1.d, #5 // =0x5
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i64_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ld1d { z16.d, z24.d }, pn8/z, [x0]
+; SSVE-NEXT: add z16.d, z16.d, #5 // =0x5
+; SSVE-NEXT: add z24.d, z24.d, #5 // =0x5
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z24, [x0, #1, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 4 x i64>, ptr %addr
+ %b = add <vscale x 4 x i64> %a, splat (i64 5)
+ store <vscale x 4 x i64> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_i64_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_i64_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: add x8, x0, x1, lsl #3
+; SVE-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
+; SVE-NEXT: add z1.d, z1.d, #5 // =0x5
+; SVE-NEXT: add z0.d, z0.d, #5 // =0x5
+; SVE-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_i64_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ptrue p0.d
+; SSVE-NEXT: add x8, x0, x1, lsl #3
+; SSVE-NEXT: ld1d { z16.d, z24.d }, pn8/z, [x0, x1, lsl #3]
+; SSVE-NEXT: add z24.d, z24.d, #5 // =0x5
+; SSVE-NEXT: add z16.d, z16.d, #5 // =0x5
+; SSVE-NEXT: st1d { z16.d }, p0, [x0, x1, lsl #3]
+; SSVE-NEXT: str z24, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i64, ptr %base, i64 %idx
+ %a = load <vscale x 4 x i64>, ptr %addr
+ %b = add <vscale x 4 x i64> %a, splat (i64 5)
+ store <vscale x 4 x i64> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_f16_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_f16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; SVE-NEXT: movprfx z2, z1
+; SVE-NEXT: fadd z2.h, p0/m, z2.h, #1.0
+; SVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SVE-NEXT: str z2, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_f16_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; SSVE-NEXT: movprfx z0, z24
+; SSVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SSVE-NEXT: movprfx z1, z16
+; SSVE-NEXT: fadd z1.h, p0/m, z1.h, #1.0
+; SSVE-NEXT: str z0, [x0, #1, mul vl]
+; SSVE-NEXT: str z1, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 16 x half>, ptr %addr
+ %b = fadd <vscale x 16 x half> %a, splat (half 1.0)
+ store <vscale x 16 x half> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_f16_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_f16_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: add x8, x0, x1, lsl #1
+; SVE-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
+; SVE-NEXT: movprfx z2, z0
+; SVE-NEXT: fadd z2.h, p0/m, z2.h, #1.0
+; SVE-NEXT: movprfx z0, z1
+; SVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SVE-NEXT: st1h { z2.h }, p0, [x0, x1, lsl #1]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_f16_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: add x8, x0, x1, lsl #1
+; SSVE-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0, x1, lsl #1]
+; SSVE-NEXT: movprfx z0, z16
+; SSVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SSVE-NEXT: movprfx z1, z24
+; SSVE-NEXT: fadd z1.h, p0/m, z1.h, #1.0
+; SSVE-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; SSVE-NEXT: str z1, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr half, ptr %base, i64 %idx
+ %a = load <vscale x 16 x half>, ptr %addr
+ %b = fadd <vscale x 16 x half> %a, splat (half 1.0)
+ store <vscale x 16 x half> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_f32_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_f32_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0]
+; SVE-NEXT: movprfx z2, z1
+; SVE-NEXT: fadd z2.s, p0/m, z2.s, #1.0
+; SVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SVE-NEXT: str z2, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_f32_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ptrue p0.s
+; SSVE-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x0]
+; SSVE-NEXT: movprfx z0, z24
+; SSVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SSVE-NEXT: movprfx z1, z16
+; SSVE-NEXT: fadd z1.s, p0/m, z1.s, #1.0
+; SSVE-NEXT: str z0, [x0, #1, mul vl]
+; SSVE-NEXT: str z1, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 8 x float>, ptr %addr
+ %b = fadd <vscale x 8 x float> %a, splat (float 1.0)
+ store <vscale x 8 x float> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_f32_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_f32_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: add x8, x0, x1, lsl #2
+; SVE-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
+; SVE-NEXT: movprfx z2, z0
+; SVE-NEXT: fadd z2.s, p0/m, z2.s, #1.0
+; SVE-NEXT: movprfx z0, z1
+; SVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SVE-NEXT: st1w { z2.s }, p0, [x0, x1, lsl #2]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_f32_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ptrue p0.s
+; SSVE-NEXT: add x8, x0, x1, lsl #2
+; SSVE-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x0, x1, lsl #2]
+; SSVE-NEXT: movprfx z0, z16
+; SSVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SSVE-NEXT: movprfx z1, z24
+; SSVE-NEXT: fadd z1.s, p0/m, z1.s, #1.0
+; SSVE-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; SSVE-NEXT: str z1, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr float, ptr %base, i64 %idx
+ %a = load <vscale x 8 x float>, ptr %addr
+ %b = fadd <vscale x 8 x float> %a, splat (float 1.0)
+ store <vscale x 8 x float> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_f64_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_f64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0]
+; SVE-NEXT: movprfx z2, z1
+; SVE-NEXT: fadd z2.d, p0/m, z2.d, #1.0
+; SVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SVE-NEXT: str z2, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_f64_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ptrue p0.d
+; SSVE-NEXT: ld1d { z16.d, z24.d }, pn8/z, [x0]
+; SSVE-NEXT: movprfx z0, z24
+; SSVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SSVE-NEXT: movprfx z1, z16
+; SSVE-NEXT: fadd z1.d, p0/m, z1.d, #1.0
+; SSVE-NEXT: str z0, [x0, #1, mul vl]
+; SSVE-NEXT: str z1, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 4 x double>, ptr %addr
+ %b = fadd <vscale x 4 x double> %a, splat (double 1.0)
+ store <vscale x 4 x double> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_f64_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_f64_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: add x8, x0, x1, lsl #3
+; SVE-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
+; SVE-NEXT: movprfx z2, z0
+; SVE-NEXT: fadd z2.d, p0/m, z2.d, #1.0
+; SVE-NEXT: movprfx z0, z1
+; SVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SVE-NEXT: st1d { z2.d }, p0, [x0, x1, lsl #3]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_f64_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ptrue p0.d
+; SSVE-NEXT: add x8, x0, x1, lsl #3
+; SSVE-NEXT: ld1d { z16.d, z24.d }, pn8/z, [x0, x1, lsl #3]
+; SSVE-NEXT: movprfx z0, z16
+; SSVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SSVE-NEXT: movprfx z1, z24
+; SSVE-NEXT: fadd z1.d, p0/m, z1.d, #1.0
+; SSVE-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; SSVE-NEXT: str z1, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr double, ptr %base, i64 %idx
+ %a = load <vscale x 4 x double>, ptr %addr
+ %b = fadd <vscale x 4 x double> %a, splat (double 1.0)
+ store <vscale x 4 x double> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_bf16_r(ptr %addr) {
+; SVE-LABEL: load_2x_vectors_bf16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: fmov z0.h, #1.87500000
+; SVE-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x0]
+; SVE-NEXT: bfadd z1.h, z3.h, z0.h
+; SVE-NEXT: bfadd z0.h, z2.h, z0.h
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_bf16_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: fmov z0.h, #1.87500000
+; SSVE-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; SSVE-NEXT: bfadd z1.h, z24.h, z0.h
+; SSVE-NEXT: bfadd z0.h, z16.h, z0.h
+; SSVE-NEXT: str z1, [x0, #1, mul vl]
+; SSVE-NEXT: str z0, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 16 x bfloat>, ptr %addr
+ %b = fadd <vscale x 16 x bfloat> %a, splat (bfloat 1.0)
+ store <vscale x 16 x bfloat> %b, ptr %addr
+ ret void
+}
+
+define void @load_2x_vectors_bf16_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_2x_vectors_bf16_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: fmov z0.h, #1.87500000
+; SVE-NEXT: add x8, x0, x1, lsl #1
+; SVE-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x0, x1, lsl #1]
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: bfadd z1.h, z2.h, z0.h
+; SVE-NEXT: bfadd z0.h, z3.h, z0.h
+; SVE-NEXT: st1h { z1.h }, p0, [x0, x1, lsl #1]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_2x_vectors_bf16_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: fmov z0.h, #1.87500000
+; SSVE-NEXT: add x8, x0, x1, lsl #1
+; SSVE-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0, x1, lsl #1]
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: bfadd z1.h, z16.h, z0.h
+; SSVE-NEXT: bfadd z0.h, z24.h, z0.h
+; SSVE-NEXT: st1h { z1.h }, p0, [x0, x1, lsl #1]
+; SSVE-NEXT: str z0, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr bfloat, ptr %base, i64 %idx
+ %a = load <vscale x 16 x bfloat>, ptr %addr
+ %b = fadd <vscale x 16 x bfloat> %a, splat (bfloat 1.0)
+ store <vscale x 16 x bfloat> %b, ptr %addr
+ ret void
+}
+
+
+
+define void @load_4x_vectors_i8_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_i8_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.b
+; SVE-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; SVE-NEXT: add z0.b, z0.b, #5 // =0x5
+; SVE-NEXT: add z1.b, z1.b, #5 // =0x5
+; SVE-NEXT: add z2.b, z2.b, #5 // =0x5
+; SVE-NEXT: add z3.b, z3.b, #5 // =0x5
+; SVE-NEXT: str z2, [x0, #2, mul vl]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z3, [x0, #3, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i8_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.b
+; SSVE-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; SSVE-NEXT: add z16.b, z16.b, #5 // =0x5
+; SSVE-NEXT: add z20.b, z20.b, #5 // =0x5
+; SSVE-NEXT: add z24.b, z24.b, #5 // =0x5
+; SSVE-NEXT: add z28.b, z28.b, #5 // =0x5
+; SSVE-NEXT: str z24, [x0, #2, mul vl]
+; SSVE-NEXT: str z20, [x0, #1, mul vl]
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z28, [x0, #3, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 64 x i8>, ptr %addr
+ %b = add <vscale x 64 x i8> %a, splat (i8 5)
+ store <vscale x 64 x i8> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i8_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_i8_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.b
+; SVE-NEXT: ptrue p0.b
+; SVE-NEXT: add x8, x0, x1
+; SVE-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0, x1]
+; SVE-NEXT: add z1.b, z1.b, #5 // =0x5
+; SVE-NEXT: add z2.b, z2.b, #5 // =0x5
+; SVE-NEXT: add z3.b, z3.b, #5 // =0x5
+; SVE-NEXT: add z0.b, z0.b, #5 // =0x5
+; SVE-NEXT: st1b { z0.b }, p0, [x0, x1]
+; SVE-NEXT: str z3, [x8, #3, mul vl]
+; SVE-NEXT: str z2, [x8, #2, mul vl]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i8_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.b
+; SSVE-NEXT: ptrue p0.b
+; SSVE-NEXT: add x8, x0, x1
+; SSVE-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; SSVE-NEXT: add z20.b, z20.b, #5 // =0x5
+; SSVE-NEXT: add z24.b, z24.b, #5 // =0x5
+; SSVE-NEXT: add z28.b, z28.b, #5 // =0x5
+; SSVE-NEXT: add z16.b, z16.b, #5 // =0x5
+; SSVE-NEXT: st1b { z16.b }, p0, [x0, x1]
+; SSVE-NEXT: str z28, [x8, #3, mul vl]
+; SSVE-NEXT: str z24, [x8, #2, mul vl]
+; SSVE-NEXT: str z20, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i8, ptr %base, i64 %idx
+ %a = load <vscale x 64 x i8>, ptr %addr
+ %b = add <vscale x 64 x i8> %a, splat (i8 5)
+ store <vscale x 64 x i8> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i16_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_i16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; SVE-NEXT: add z0.h, z0.h, #5 // =0x5
+; SVE-NEXT: add z1.h, z1.h, #5 // =0x5
+; SVE-NEXT: add z2.h, z2.h, #5 // =0x5
+; SVE-NEXT: add z3.h, z3.h, #5 // =0x5
+; SVE-NEXT: str z2, [x0, #2, mul vl]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z3, [x0, #3, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i16_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0]
+; SSVE-NEXT: add z16.h, z16.h, #5 // =0x5
+; SSVE-NEXT: add z20.h, z20.h, #5 // =0x5
+; SSVE-NEXT: add z24.h, z24.h, #5 // =0x5
+; SSVE-NEXT: add z28.h, z28.h, #5 // =0x5
+; SSVE-NEXT: str z24, [x0, #2, mul vl]
+; SSVE-NEXT: str z20, [x0, #1, mul vl]
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z28, [x0, #3, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 32 x i16>, ptr %addr
+ %b = add <vscale x 32 x i16> %a, splat (i16 5)
+ store <vscale x 32 x i16> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i16_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_i16_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: add x8, x0, x1, lsl #1
+; SVE-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
+; SVE-NEXT: add z1.h, z1.h, #5 // =0x5
+; SVE-NEXT: add z2.h, z2.h, #5 // =0x5
+; SVE-NEXT: add z3.h, z3.h, #5 // =0x5
+; SVE-NEXT: add z0.h, z0.h, #5 // =0x5
+; SVE-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; SVE-NEXT: str z3, [x8, #3, mul vl]
+; SVE-NEXT: str z2, [x8, #2, mul vl]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i16_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: add x8, x0, x1, lsl #1
+; SSVE-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0, x1, lsl #1]
+; SSVE-NEXT: add z20.h, z20.h, #5 // =0x5
+; SSVE-NEXT: add z24.h, z24.h, #5 // =0x5
+; SSVE-NEXT: add z28.h, z28.h, #5 // =0x5
+; SSVE-NEXT: add z16.h, z16.h, #5 // =0x5
+; SSVE-NEXT: st1h { z16.h }, p0, [x0, x1, lsl #1]
+; SSVE-NEXT: str z28, [x8, #3, mul vl]
+; SSVE-NEXT: str z24, [x8, #2, mul vl]
+; SSVE-NEXT: str z20, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i16, ptr %base, i64 %idx
+ %a = load <vscale x 32 x i16>, ptr %addr
+ %b = add <vscale x 32 x i16> %a, splat (i16 5)
+ store <vscale x 32 x i16> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i32_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_i32_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0]
+; SVE-NEXT: add z0.s, z0.s, #5 // =0x5
+; SVE-NEXT: add z1.s, z1.s, #5 // =0x5
+; SVE-NEXT: add z2.s, z2.s, #5 // =0x5
+; SVE-NEXT: add z3.s, z3.s, #5 // =0x5
+; SVE-NEXT: str z2, [x0, #2, mul vl]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z3, [x0, #3, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i32_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x0]
+; SSVE-NEXT: add z16.s, z16.s, #5 // =0x5
+; SSVE-NEXT: add z20.s, z20.s, #5 // =0x5
+; SSVE-NEXT: add z24.s, z24.s, #5 // =0x5
+; SSVE-NEXT: add z28.s, z28.s, #5 // =0x5
+; SSVE-NEXT: str z24, [x0, #2, mul vl]
+; SSVE-NEXT: str z20, [x0, #1, mul vl]
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z28, [x0, #3, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 16 x i32>, ptr %addr
+ %b = add <vscale x 16 x i32> %a, splat (i32 5)
+ store <vscale x 16 x i32> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i32_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_i32_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: add x8, x0, x1, lsl #2
+; SVE-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
+; SVE-NEXT: add z1.s, z1.s, #5 // =0x5
+; SVE-NEXT: add z2.s, z2.s, #5 // =0x5
+; SVE-NEXT: add z3.s, z3.s, #5 // =0x5
+; SVE-NEXT: add z0.s, z0.s, #5 // =0x5
+; SVE-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; SVE-NEXT: str z3, [x8, #3, mul vl]
+; SVE-NEXT: str z2, [x8, #2, mul vl]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i32_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ptrue p0.s
+; SSVE-NEXT: add x8, x0, x1, lsl #2
+; SSVE-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x0, x1, lsl #2]
+; SSVE-NEXT: add z20.s, z20.s, #5 // =0x5
+; SSVE-NEXT: add z24.s, z24.s, #5 // =0x5
+; SSVE-NEXT: add z28.s, z28.s, #5 // =0x5
+; SSVE-NEXT: add z16.s, z16.s, #5 // =0x5
+; SSVE-NEXT: st1w { z16.s }, p0, [x0, x1, lsl #2]
+; SSVE-NEXT: str z28, [x8, #3, mul vl]
+; SSVE-NEXT: str z24, [x8, #2, mul vl]
+; SSVE-NEXT: str z20, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i32, ptr %base, i64 %idx
+ %a = load <vscale x 16 x i32>, ptr %addr
+ %b = add <vscale x 16 x i32> %a, splat (i32 5)
+ store <vscale x 16 x i32> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i64_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_i64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0]
+; SVE-NEXT: add z0.d, z0.d, #5 // =0x5
+; SVE-NEXT: add z1.d, z1.d, #5 // =0x5
+; SVE-NEXT: add z2.d, z2.d, #5 // =0x5
+; SVE-NEXT: add z3.d, z3.d, #5 // =0x5
+; SVE-NEXT: str z2, [x0, #2, mul vl]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: str z3, [x0, #3, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i64_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x0]
+; SSVE-NEXT: add z16.d, z16.d, #5 // =0x5
+; SSVE-NEXT: add z20.d, z20.d, #5 // =0x5
+; SSVE-NEXT: add z24.d, z24.d, #5 // =0x5
+; SSVE-NEXT: add z28.d, z28.d, #5 // =0x5
+; SSVE-NEXT: str z24, [x0, #2, mul vl]
+; SSVE-NEXT: str z20, [x0, #1, mul vl]
+; SSVE-NEXT: str z16, [x0]
+; SSVE-NEXT: str z28, [x0, #3, mul vl]
+; SSVE-NEXT: ret
+ %a = load <vscale x 8 x i64>, ptr %addr
+ %b = add <vscale x 8 x i64> %a, splat (i64 5)
+ store <vscale x 8 x i64> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_i64_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_i64_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: add x8, x0, x1, lsl #3
+; SVE-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
+; SVE-NEXT: add z1.d, z1.d, #5 // =0x5
+; SVE-NEXT: add z2.d, z2.d, #5 // =0x5
+; SVE-NEXT: add z3.d, z3.d, #5 // =0x5
+; SVE-NEXT: add z0.d, z0.d, #5 // =0x5
+; SVE-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; SVE-NEXT: str z3, [x8, #3, mul vl]
+; SVE-NEXT: str z2, [x8, #2, mul vl]
+; SVE-NEXT: str z1, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_i64_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ptrue p0.d
+; SSVE-NEXT: add x8, x0, x1, lsl #3
+; SSVE-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x0, x1, lsl #3]
+; SSVE-NEXT: add z20.d, z20.d, #5 // =0x5
+; SSVE-NEXT: add z24.d, z24.d, #5 // =0x5
+; SSVE-NEXT: add z28.d, z28.d, #5 // =0x5
+; SSVE-NEXT: add z16.d, z16.d, #5 // =0x5
+; SSVE-NEXT: st1d { z16.d }, p0, [x0, x1, lsl #3]
+; SSVE-NEXT: str z28, [x8, #3, mul vl]
+; SSVE-NEXT: str z24, [x8, #2, mul vl]
+; SSVE-NEXT: str z20, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr i64, ptr %base, i64 %idx
+ %a = load <vscale x 8 x i64>, ptr %addr
+ %b = add <vscale x 8 x i64> %a, splat (i64 5)
+ store <vscale x 8 x i64> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_f16_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_f16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; SVE-NEXT: movprfx z4, z3
+; SVE-NEXT: fadd z4.h, p0/m, z4.h, #1.0
+; SVE-NEXT: movprfx z5, z2
+; SVE-NEXT: fadd z5.h, p0/m, z5.h, #1.0
+; SVE-NEXT: movprfx z6, z1
+; SVE-NEXT: fadd z6.h, p0/m, z6.h, #1.0
+; SVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SVE-NEXT: str z4, [x0, #3, mul vl]
+; SVE-NEXT: str z5, [x0, #2, mul vl]
+; SVE-NEXT: str z6, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_f16_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0]
+; SSVE-NEXT: movprfx z0, z28
+; SSVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SSVE-NEXT: movprfx z1, z24
+; SSVE-NEXT: fadd z1.h, p0/m, z1.h, #1.0
+; SSVE-NEXT: movprfx z2, z20
+; SSVE-NEXT: fadd z2.h, p0/m, z2.h, #1.0
+; SSVE-NEXT: movprfx z3, z16
+; SSVE-NEXT: fadd z3.h, p0/m, z3.h, #1.0
+; SSVE-NEXT: str z0, [x0, #3, mul vl]
+; SSVE-NEXT: str z1, [x0, #2, mul vl]
+; SSVE-NEXT: str z2, [x0, #1, mul vl]
+; SSVE-NEXT: str z3, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 32 x half>, ptr %addr
+ %b = fadd <vscale x 32 x half> %a, splat (half 1.0)
+ store <vscale x 32 x half> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_f16_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_f16_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: add x8, x0, x1, lsl #1
+; SVE-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
+; SVE-NEXT: movprfx z4, z0
+; SVE-NEXT: fadd z4.h, p0/m, z4.h, #1.0
+; SVE-NEXT: movprfx z5, z3
+; SVE-NEXT: fadd z5.h, p0/m, z5.h, #1.0
+; SVE-NEXT: movprfx z6, z2
+; SVE-NEXT: fadd z6.h, p0/m, z6.h, #1.0
+; SVE-NEXT: movprfx z0, z1
+; SVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SVE-NEXT: st1h { z4.h }, p0, [x0, x1, lsl #1]
+; SVE-NEXT: str z5, [x8, #3, mul vl]
+; SVE-NEXT: str z6, [x8, #2, mul vl]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_f16_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: add x8, x0, x1, lsl #1
+; SSVE-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0, x1, lsl #1]
+; SSVE-NEXT: movprfx z0, z16
+; SSVE-NEXT: fadd z0.h, p0/m, z0.h, #1.0
+; SSVE-NEXT: movprfx z1, z28
+; SSVE-NEXT: fadd z1.h, p0/m, z1.h, #1.0
+; SSVE-NEXT: movprfx z2, z24
+; SSVE-NEXT: fadd z2.h, p0/m, z2.h, #1.0
+; SSVE-NEXT: movprfx z3, z20
+; SSVE-NEXT: fadd z3.h, p0/m, z3.h, #1.0
+; SSVE-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; SSVE-NEXT: str z1, [x8, #3, mul vl]
+; SSVE-NEXT: str z2, [x8, #2, mul vl]
+; SSVE-NEXT: str z3, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr half, ptr %base, i64 %idx
+ %a = load <vscale x 32 x half>, ptr %addr
+ %b = fadd <vscale x 32 x half> %a, splat (half 1.0)
+ store <vscale x 32 x half> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_f32_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_f32_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0]
+; SVE-NEXT: movprfx z4, z3
+; SVE-NEXT: fadd z4.s, p0/m, z4.s, #1.0
+; SVE-NEXT: movprfx z5, z2
+; SVE-NEXT: fadd z5.s, p0/m, z5.s, #1.0
+; SVE-NEXT: movprfx z6, z1
+; SVE-NEXT: fadd z6.s, p0/m, z6.s, #1.0
+; SVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SVE-NEXT: str z4, [x0, #3, mul vl]
+; SVE-NEXT: str z5, [x0, #2, mul vl]
+; SVE-NEXT: str z6, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_f32_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ptrue p0.s
+; SSVE-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x0]
+; SSVE-NEXT: movprfx z0, z28
+; SSVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SSVE-NEXT: movprfx z1, z24
+; SSVE-NEXT: fadd z1.s, p0/m, z1.s, #1.0
+; SSVE-NEXT: movprfx z2, z20
+; SSVE-NEXT: fadd z2.s, p0/m, z2.s, #1.0
+; SSVE-NEXT: movprfx z3, z16
+; SSVE-NEXT: fadd z3.s, p0/m, z3.s, #1.0
+; SSVE-NEXT: str z0, [x0, #3, mul vl]
+; SSVE-NEXT: str z1, [x0, #2, mul vl]
+; SSVE-NEXT: str z2, [x0, #1, mul vl]
+; SSVE-NEXT: str z3, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 16 x float>, ptr %addr
+ %b = fadd <vscale x 16 x float> %a, splat (float 1.0)
+ store <vscale x 16 x float> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_f32_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_f32_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: add x8, x0, x1, lsl #2
+; SVE-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
+; SVE-NEXT: movprfx z4, z0
+; SVE-NEXT: fadd z4.s, p0/m, z4.s, #1.0
+; SVE-NEXT: movprfx z5, z3
+; SVE-NEXT: fadd z5.s, p0/m, z5.s, #1.0
+; SVE-NEXT: movprfx z6, z2
+; SVE-NEXT: fadd z6.s, p0/m, z6.s, #1.0
+; SVE-NEXT: movprfx z0, z1
+; SVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SVE-NEXT: st1w { z4.s }, p0, [x0, x1, lsl #2]
+; SVE-NEXT: str z5, [x8, #3, mul vl]
+; SVE-NEXT: str z6, [x8, #2, mul vl]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_f32_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.s
+; SSVE-NEXT: ptrue p0.s
+; SSVE-NEXT: add x8, x0, x1, lsl #2
+; SSVE-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x0, x1, lsl #2]
+; SSVE-NEXT: movprfx z0, z16
+; SSVE-NEXT: fadd z0.s, p0/m, z0.s, #1.0
+; SSVE-NEXT: movprfx z1, z28
+; SSVE-NEXT: fadd z1.s, p0/m, z1.s, #1.0
+; SSVE-NEXT: movprfx z2, z24
+; SSVE-NEXT: fadd z2.s, p0/m, z2.s, #1.0
+; SSVE-NEXT: movprfx z3, z20
+; SSVE-NEXT: fadd z3.s, p0/m, z3.s, #1.0
+; SSVE-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; SSVE-NEXT: str z1, [x8, #3, mul vl]
+; SSVE-NEXT: str z2, [x8, #2, mul vl]
+; SSVE-NEXT: str z3, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr float, ptr %base, i64 %idx
+ %a = load <vscale x 16 x float>, ptr %addr
+ %b = fadd <vscale x 16 x float> %a, splat (float 1.0)
+ store <vscale x 16 x float> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_f64_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_f64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0]
+; SVE-NEXT: movprfx z4, z3
+; SVE-NEXT: fadd z4.d, p0/m, z4.d, #1.0
+; SVE-NEXT: movprfx z5, z2
+; SVE-NEXT: fadd z5.d, p0/m, z5.d, #1.0
+; SVE-NEXT: movprfx z6, z1
+; SVE-NEXT: fadd z6.d, p0/m, z6.d, #1.0
+; SVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SVE-NEXT: str z4, [x0, #3, mul vl]
+; SVE-NEXT: str z5, [x0, #2, mul vl]
+; SVE-NEXT: str z6, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_f64_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ptrue p0.d
+; SSVE-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x0]
+; SSVE-NEXT: movprfx z0, z28
+; SSVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SSVE-NEXT: movprfx z1, z24
+; SSVE-NEXT: fadd z1.d, p0/m, z1.d, #1.0
+; SSVE-NEXT: movprfx z2, z20
+; SSVE-NEXT: fadd z2.d, p0/m, z2.d, #1.0
+; SSVE-NEXT: movprfx z3, z16
+; SSVE-NEXT: fadd z3.d, p0/m, z3.d, #1.0
+; SSVE-NEXT: str z0, [x0, #3, mul vl]
+; SSVE-NEXT: str z1, [x0, #2, mul vl]
+; SSVE-NEXT: str z2, [x0, #1, mul vl]
+; SSVE-NEXT: str z3, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 8 x double>, ptr %addr
+ %b = fadd <vscale x 8 x double> %a, splat (double 1.0)
+ store <vscale x 8 x double> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_f64_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_f64_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: add x8, x0, x1, lsl #3
+; SVE-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
+; SVE-NEXT: movprfx z4, z0
+; SVE-NEXT: fadd z4.d, p0/m, z4.d, #1.0
+; SVE-NEXT: movprfx z5, z3
+; SVE-NEXT: fadd z5.d, p0/m, z5.d, #1.0
+; SVE-NEXT: movprfx z6, z2
+; SVE-NEXT: fadd z6.d, p0/m, z6.d, #1.0
+; SVE-NEXT: movprfx z0, z1
+; SVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SVE-NEXT: st1d { z4.d }, p0, [x0, x1, lsl #3]
+; SVE-NEXT: str z5, [x8, #3, mul vl]
+; SVE-NEXT: str z6, [x8, #2, mul vl]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_f64_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.d
+; SSVE-NEXT: ptrue p0.d
+; SSVE-NEXT: add x8, x0, x1, lsl #3
+; SSVE-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x0, x1, lsl #3]
+; SSVE-NEXT: movprfx z0, z16
+; SSVE-NEXT: fadd z0.d, p0/m, z0.d, #1.0
+; SSVE-NEXT: movprfx z1, z28
+; SSVE-NEXT: fadd z1.d, p0/m, z1.d, #1.0
+; SSVE-NEXT: movprfx z2, z24
+; SSVE-NEXT: fadd z2.d, p0/m, z2.d, #1.0
+; SSVE-NEXT: movprfx z3, z20
+; SSVE-NEXT: fadd z3.d, p0/m, z3.d, #1.0
+; SSVE-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; SSVE-NEXT: str z1, [x8, #3, mul vl]
+; SSVE-NEXT: str z2, [x8, #2, mul vl]
+; SSVE-NEXT: str z3, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr double, ptr %base, i64 %idx
+ %a = load <vscale x 8 x double>, ptr %addr
+ %b = fadd <vscale x 8 x double> %a, splat (double 1.0)
+ store <vscale x 8 x double> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_bf16_r(ptr %addr) {
+; SVE-LABEL: load_4x_vectors_bf16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: fmov z0.h, #1.87500000
+; SVE-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0]
+; SVE-NEXT: bfadd z1.h, z7.h, z0.h
+; SVE-NEXT: bfadd z2.h, z6.h, z0.h
+; SVE-NEXT: str z1, [x0, #3, mul vl]
+; SVE-NEXT: bfadd z1.h, z5.h, z0.h
+; SVE-NEXT: bfadd z0.h, z4.h, z0.h
+; SVE-NEXT: str z2, [x0, #2, mul vl]
+; SVE-NEXT: str z1, [x0, #1, mul vl]
+; SVE-NEXT: str z0, [x0]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_bf16_r:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: fmov z0.h, #1.87500000
+; SSVE-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0]
+; SSVE-NEXT: bfadd z1.h, z28.h, z0.h
+; SSVE-NEXT: bfadd z2.h, z24.h, z0.h
+; SSVE-NEXT: str z1, [x0, #3, mul vl]
+; SSVE-NEXT: bfadd z1.h, z20.h, z0.h
+; SSVE-NEXT: bfadd z0.h, z16.h, z0.h
+; SSVE-NEXT: str z2, [x0, #2, mul vl]
+; SSVE-NEXT: str z1, [x0, #1, mul vl]
+; SSVE-NEXT: str z0, [x0]
+; SSVE-NEXT: ret
+ %a = load <vscale x 32 x bfloat>, ptr %addr
+ %b = fadd <vscale x 32 x bfloat> %a, splat (bfloat 1.0)
+ store <vscale x 32 x bfloat> %b, ptr %addr
+ ret void
+}
+
+define void @load_4x_vectors_bf16_rr(ptr %base, i64 %idx) {
+; SVE-LABEL: load_4x_vectors_bf16_rr:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue pn8.h
+; SVE-NEXT: fmov z0.h, #1.87500000
+; SVE-NEXT: add x8, x0, x1, lsl #1
+; SVE-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0, x1, lsl #1]
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: bfadd z1.h, z4.h, z0.h
+; SVE-NEXT: bfadd z2.h, z7.h, z0.h
+; SVE-NEXT: st1h { z1.h }, p0, [x0, x1, lsl #1]
+; SVE-NEXT: bfadd z1.h, z6.h, z0.h
+; SVE-NEXT: bfadd z0.h, z5.h, z0.h
+; SVE-NEXT: str z2, [x8, #3, mul vl]
+; SVE-NEXT: str z1, [x8, #2, mul vl]
+; SVE-NEXT: str z0, [x8, #1, mul vl]
+; SVE-NEXT: ret
+;
+; SSVE-LABEL: load_4x_vectors_bf16_rr:
+; SSVE: // %bb.0:
+; SSVE-NEXT: ptrue pn8.h
+; SSVE-NEXT: fmov z0.h, #1.87500000
+; SSVE-NEXT: add x8, x0, x1, lsl #1
+; SSVE-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0, x1, lsl #1]
+; SSVE-NEXT: ptrue p0.h
+; SSVE-NEXT: bfadd z1.h, z16.h, z0.h
+; SSVE-NEXT: bfadd z2.h, z28.h, z0.h
+; SSVE-NEXT: st1h { z1.h }, p0, [x0, x1, lsl #1]
+; SSVE-NEXT: bfadd z1.h, z24.h, z0.h
+; SSVE-NEXT: bfadd z0.h, z20.h, z0.h
+; SSVE-NEXT: str z2, [x8, #3, mul vl]
+; SSVE-NEXT: str z1, [x8, #2, mul vl]
+; SSVE-NEXT: str z0, [x8, #1, mul vl]
+; SSVE-NEXT: ret
+ %addr = getelementptr bfloat, ptr %base, i64 %idx
+ %a = load <vscale x 32 x bfloat>, ptr %addr
+ %b = fadd <vscale x 32 x bfloat> %a, splat (bfloat 1.0)
+ store <vscale x 32 x bfloat> %b, ptr %addr
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}