[llvm] 2e585dd - [AArch64][SVE] Lower vector.insert to predicated merged MOV
Matt Devereau via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 13 03:18:08 PST 2021
Author: Matt Devereau
Date: 2021-12-13T11:17:55Z
New Revision: 2e585dd91a87d15cd68b4a3f63ffb15800c13bc8
URL: https://github.com/llvm/llvm-project/commit/2e585dd91a87d15cd68b4a3f63ffb15800c13bc8
DIFF: https://github.com/llvm/llvm-project/commit/2e585dd91a87d15cd68b4a3f63ffb15800c13bc8.diff
LOG: [AArch64][SVE] Lower vector.insert to predicated merged MOV
Use a predicated SEL (merging MOV) to lower vector.insert at index 0 instead of going through memory (spilling the scalable vector to the stack, overwriting its low part, and reloading).
Differential Revision: https://reviews.llvm.org/D115259
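
For reference, a minimal example of the pattern this change affects, adapted from the updated sve-insert-vector.ll test below: inserting a fixed-length vector into the low lanes (index 0) of a scalable vector.

define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
  ret <vscale x 2 x i64> %retval
}

With this patch the function compiles to a predicate covering the low two doubleword lanes plus a merging MOV:

  ptrue p0.d, vl2
  // kill: def $q1 killed $q1 def $z1
  mov z0.d, p0/m, z1.d
  ret

rather than the previous st1d/str/ld1d sequence through a stack slot (visible as the removed CHECK lines in the test diffs).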
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
llvm/test/CodeGen/AArch64/split-vector-insert.ll
llvm/test/CodeGen/AArch64/sve-insert-vector.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index afafaa7facb74..32d1b6adc286f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10958,16 +10958,15 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
EVT InVT = Op.getOperand(1).getValueType();
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- if (InVT.isScalableVector()) {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
+ SDValue Vec0 = Op.getOperand(0);
+ SDValue Vec1 = Op.getOperand(1);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ if (InVT.isScalableVector()) {
if (!isTypeLegal(VT))
return SDValue();
- SDValue Vec0 = Op.getOperand(0);
- SDValue Vec1 = Op.getOperand(1);
-
// Ensure the subvector is half the size of the main vector.
if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
return SDValue();
@@ -10997,9 +10996,18 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
return SDValue();
}
- // This will be matched by custom code during ISelDAGToDAG.
- if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
- return Op;
+ if (Idx == 0 && isPackedVectorType(VT, DAG)) {
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Vec0.isUndef())
+ return Op;
+
+ unsigned int PredPattern =
+ getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
+ auto PredTy = VT.changeVectorElementType(MVT::i1);
+ SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern);
+ SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
+ return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
+ }
return SDValue();
}
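
In short (based on the lowering change above): for an index-0 insert of a fixed-length vector whose packed container type matches the scalable result type, the fixed-length operand is widened into a scalable register, a ptrue predicate covering exactly the subvector's element count is built from getSVEPredPatternFromNumElements, and an ISD::VSELECT merges the new low lanes into the original vector. That VSELECT is what later selects to the predicated merging MOV (SEL) seen in the test updates. A rough sketch of the resulting code for the nxv8i16/v8i16 case, taken from the updated tests below:

  ptrue p0.h, vl8          // predicate active for the low 8 halfword lanes
  // kill: def $q1 killed $q1 def $z1
  mov z0.h, p0/m, z1.h     // active lanes take z1 (the subvector), inactive lanes keep z0
  ret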
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index 98a9a4a35d375..3445968721c87 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -74,17 +74,12 @@ define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale
define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
%subvec = load <8 x i8>, <8 x i8>* %b
@@ -123,17 +118,12 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x
define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
%subvec = load <4 x i16>, <4 x i16>* %b
@@ -172,17 +162,12 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4
define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
%subvec = load <2 x i32>, <2 x i32>* %b
diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
index 754e96b8072df..aa3d9fb8cf6e7 100644
--- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll
+++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
@@ -17,44 +17,46 @@ define <vscale x 2 x i64> @test_nxv2i64_v8i64(<vscale x 2 x i64> %a, <8 x i64> %
; CHECK-LABEL: test_nxv2i64_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-4
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cntd x8
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sub x8, x8, #2
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: sub x8, x8, #2
+; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: cmp x8, #2
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: csel x9, x8, x9, lo
-; CHECK-NEXT: addvl x10, sp, #1
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: lsl x9, x9, #3
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: cmp x8, #4
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q2, [x10, x9]
; CHECK-NEXT: mov w9, #4
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: lsl x9, x9, #3
-; CHECK-NEXT: addvl x10, sp, #2
+; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: cmp x8, #6
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q3, [x10, x9]
; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: addvl x9, sp, #3
+; CHECK-NEXT: addvl x9, sp, #2
; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: str q4, [x9, x8]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl]
-; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
+
+
%r = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> %a, <8 x i64> %b, i64 0)
ret <vscale x 2 x i64> %r
}
@@ -68,44 +70,46 @@ define <vscale x 2 x double> @test_nxv2f64_v8f64(<vscale x 2 x double> %a, <8 x
; CHECK-LABEL: test_nxv2f64_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-4
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cntd x8
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sub x8, x8, #2
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: sub x8, x8, #2
+; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: cmp x8, #2
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: csel x9, x8, x9, lo
-; CHECK-NEXT: addvl x10, sp, #1
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: lsl x9, x9, #3
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: cmp x8, #4
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q2, [x10, x9]
; CHECK-NEXT: mov w9, #4
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: lsl x9, x9, #3
-; CHECK-NEXT: addvl x10, sp, #2
+; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: cmp x8, #6
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q3, [x10, x9]
; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: addvl x9, sp, #3
+; CHECK-NEXT: addvl x9, sp, #2
; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: str q4, [x9, x8]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl]
-; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
+
+
%r = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> %a, <8 x double> %b, i64 0)
ret <vscale x 2 x double> %r
}
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index c312ecd726f6e..58034406be465 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -4,14 +4,9 @@
define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
ret <vscale x 2 x i64> %retval
@@ -43,14 +38,9 @@ define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2
define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
ret <vscale x 4 x i32> %retval
@@ -82,14 +72,9 @@ define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4
define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
ret <vscale x 8 x i16> %retval
@@ -121,14 +106,9 @@ define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8
define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.b, p0/m, z1.b
; CHECK-NEXT: ret
%retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
ret <vscale x 16 x i8> %retval
@@ -469,7 +449,7 @@ define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vs
define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: z0.d, z1.d
+; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%v0 = call <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
ret <vscale x 2 x bfloat> %v0
@@ -478,7 +458,7 @@ define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv
define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: z0.d, z1.d
+; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
ret <vscale x 4 x bfloat> %v0
@@ -487,15 +467,15 @@ define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv
define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_v4bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: addpl x8, sp, #4
-; CHECK-NEXT: str d1, [x8]
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
ret <vscale x 4 x bfloat> %v0
@@ -504,7 +484,7 @@ define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0,
define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: z0.d, z1.d
+; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
ret <vscale x 8 x bfloat> %v0
@@ -513,14 +493,9 @@ define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv
define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_v8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
ret <vscale x 8 x bfloat> %v0