[llvm] 15406d2 - [AArch64][SVE][ISel] Combine dup of load to replicating load
Peter Waller via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 14 02:38:33 PST 2022
Author: Peter Waller
Date: 2022-12-14T10:34:26Z
New Revision: 15406d2cd69290855077f93f9d3fba05869f4364
URL: https://github.com/llvm/llvm-project/commit/15406d2cd69290855077f93f9d3fba05869f4364
DIFF: https://github.com/llvm/llvm-project/commit/15406d2cd69290855077f93f9d3fba05869f4364.diff
LOG: [AArch64][SVE][ISel] Combine dup of load to replicating load
(dup (load) z_or_x_passthrough) => (replicating load)
Differential Revision: https://reviews.llvm.org/D139637
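As a minimal illustrative sketch of the combine (the function name @example is hypothetical; it mirrors the dup_ld1rs_i32_passthruundef_nxv4i32 test added below), a dup intrinsic whose scalar operand comes from a load, with an undef or zero passthru, can now be selected as a single predicated replicating load:

define <vscale x 4 x i32> @example(<vscale x 4 x i1> %pg, i32* %addr) {
  %ld = load i32, i32* %addr
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ld)
  ret <vscale x 4 x i32> %res
}

; Before this patch the dup was selected as (roughly) a scalar load followed
; by a predicated vector mov; with the new patterns it becomes:
;   ld1rw { z0.s }, p0/z, [x0]
;   ret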
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/sve-ld1r.ll
llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3900abd175cbd..4186d43aefb0d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -327,7 +327,8 @@ def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>;
-def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
+def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3,
+ [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0, 1>]>;
def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;
def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>;
@@ -2297,43 +2298,46 @@ let Predicates = [HasSVEorSME] in {
}
let AddedComplexity = 1 in {
- class LD1RPat<ValueType vt, SDPatternOperator operator,
- Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> :
- Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
- (load (ptrue 31), GPR64:$base, $offset)>;
+ multiclass LD1RPat<ValueType vt, SDPatternOperator operator,
+ Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> {
+ def : Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
+ (load (ptrue 31), GPR64:$base, $offset)>;
+ def : Pat<(vt (AArch64dup_mt PPR:$pg, (index_vt (operator (CP GPR64:$base, immtype:$offset))), (SVEDup0Undef))),
+ (load $pg, GPR64:$base, $offset)>;
+ }
}
// LD1R of 8-bit data
- def : LD1RPat<nxv16i8, extloadi8, LD1RB_IMM, PTRUE_B, i32, am_indexed8_6b, uimm6s1>;
- def : LD1RPat<nxv8i16, zextloadi8, LD1RB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
- def : LD1RPat<nxv4i32, zextloadi8, LD1RB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
- def : LD1RPat<nxv2i64, zextloadi8, LD1RB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
- def : LD1RPat<nxv8i16, sextloadi8, LD1RSB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
- def : LD1RPat<nxv4i32, sextloadi8, LD1RSB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
- def : LD1RPat<nxv2i64, sextloadi8, LD1RSB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv16i8, extloadi8, LD1RB_IMM, PTRUE_B, i32, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv8i16, zextloadi8, LD1RB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv4i32, zextloadi8, LD1RB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv2i64, zextloadi8, LD1RB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv8i16, sextloadi8, LD1RSB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv4i32, sextloadi8, LD1RSB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+ defm : LD1RPat<nxv2i64, sextloadi8, LD1RSB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
// LD1R of 16-bit data
- def : LD1RPat<nxv8i16, extloadi16, LD1RH_IMM, PTRUE_H, i32, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv4i32, zextloadi16, LD1RH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv2i64, zextloadi16, LD1RH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv4i32, sextloadi16, LD1RSH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv2i64, sextloadi16, LD1RSH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv8i16, extloadi16, LD1RH_IMM, PTRUE_H, i32, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv4i32, zextloadi16, LD1RH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv2i64, zextloadi16, LD1RH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv4i32, sextloadi16, LD1RSH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv2i64, sextloadi16, LD1RSH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
// LD1R of 32-bit data
- def : LD1RPat<nxv4i32, load, LD1RW_IMM, PTRUE_S, i32, am_indexed32_6b, uimm6s4>;
- def : LD1RPat<nxv2i64, zextloadi32, LD1RW_D_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
- def : LD1RPat<nxv2i64, sextloadi32, LD1RSW_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+ defm : LD1RPat<nxv4i32, load, LD1RW_IMM, PTRUE_S, i32, am_indexed32_6b, uimm6s4>;
+ defm : LD1RPat<nxv2i64, zextloadi32, LD1RW_D_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+ defm : LD1RPat<nxv2i64, sextloadi32, LD1RSW_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
// LD1R of 64-bit data
- def : LD1RPat<nxv2i64, load, LD1RD_IMM, PTRUE_D, i64, am_indexed64_6b, uimm6s8>;
+ defm : LD1RPat<nxv2i64, load, LD1RD_IMM, PTRUE_D, i64, am_indexed64_6b, uimm6s8>;
// LD1R of FP data
- def : LD1RPat<nxv8f16, load, LD1RH_IMM, PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
- def : LD1RPat<nxv4f32, load, LD1RW_IMM, PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
- def : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
- def : LD1RPat<nxv2f64, load, LD1RD_IMM, PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
+ defm : LD1RPat<nxv8f16, load, LD1RH_IMM, PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
+ defm : LD1RPat<nxv4f32, load, LD1RW_IMM, PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
+ defm : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
+ defm : LD1RPat<nxv2f64, load, LD1RD_IMM, PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
// LD1R of 128-bit masked data
multiclass ld1rq_pat<ValueType vt1, SDPatternOperator op, Instruction load_instr, ComplexPattern AddrCP>{
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index ce019e5d220c4..57aeb230f3bd8 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -819,6 +819,373 @@ define <vscale x 16 x i8> @dupq_ld1rqw_i8(<16 x i8>* %a) #0 {
ret <vscale x 16 x i8> %3
}
+;
+;
+; Tests for dup:
+;
+; Positive tests:
+; * dup with passthru=undef or passthru=zero.
+; * sign/zero extending.
+; * unpacked types.
+;
+; Negative tests:
+; * dup with passthru as a parameter.
+;
+;
+
+define <vscale x 16 x i8> @dup_ld1rb_i8_passthruundef_nxv16i8(<vscale x 16 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rb_i8_passthruundef_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, i8 %ld)
+ ret <vscale x 16 x i8> %res
+}
+define <vscale x 8 x i16> @dup_ld1rh_i16_passthruundef_nxv8i16(<vscale x 8 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: dup_ld1rh_i16_passthruundef_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ld)
+ ret <vscale x 8 x i16> %res
+}
+define <vscale x 8 x i16> @dup_ld1rh_i8_passthruundef_nxv8i16_sext(<vscale x 8 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %ext = sext i8 %ld to i16
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ext)
+ ret <vscale x 8 x i16> %res
+}
+define <vscale x 8 x i16> @dup_ld1rh_i8_passthruundef_nxv8i16_zext(<vscale x 8 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %ext = zext i8 %ld to i16
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ext)
+ ret <vscale x 8 x i16> %res
+}
+define <vscale x 4 x i32> @dup_ld1rs_i32_passthruundef_nxv4i32(<vscale x 4 x i1> %pg, i32* %addr) {
+; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i32, i32* %addr
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ld)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 4 x i32> @dup_ld1rs_i8_passthruundef_nxv4i32_sext(<vscale x 4 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %ext = sext i8 %ld to i32
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 4 x i32> @dup_ld1rs_i8_passthruundef_nxv4i32_zext(<vscale x 4 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %ext = zext i8 %ld to i32
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 4 x i32> @dup_ld1rs_i16_passthruundef_nxv4i32_sext(<vscale x 4 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %ext = sext i16 %ld to i32
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 4 x i32> @dup_ld1rs_i16_passthruundef_nxv4i32_zext(<vscale x 4 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %ext = zext i16 %ld to i32
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 2 x i64> @dup_ld1rd_i64_passthruundef_nxv2i64(<vscale x 2 x i1> %pg, i64* %addr) {
+; CHECK-LABEL: dup_ld1rd_i64_passthruundef_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i64, i64* %addr
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ld)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 2 x i64> @dup_ld1rs_i8_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %ext = sext i8 %ld to i64
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 2 x i64> @dup_ld1rs_i8_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %ext = zext i8 %ld to i64
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 2 x i64> @dup_ld1rs_i16_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %ext = sext i16 %ld to i64
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 2 x i64> @dup_ld1rs_i16_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %ext = zext i16 %ld to i64
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, i32* %addr) {
+; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i32, i32* %addr
+ %ext = sext i32 %ld to i64
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, i32* %addr) {
+; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i32, i32* %addr
+ %ext = zext i32 %ld to i64
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 8 x half> @dup_ld1rh_half_passthruundef_nxv8f16(<vscale x 8 x i1> %pg, half* %addr) {
+; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load half, half* %addr
+ %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half %ld)
+ ret <vscale x 8 x half> %res
+}
+define <vscale x 4 x float> @dup_ld1rs_float_passthruundef_nxv4f32(<vscale x 4 x i1> %pg, float* %addr) {
+; CHECK-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load float, float* %addr
+ %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, float %ld)
+ ret <vscale x 4 x float> %res
+}
+define <vscale x 2 x double> @dup_ld1rd_double_passthruundef_nxv2f64(<vscale x 2 x i1> %pg, double* %addr) {
+; CHECK-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load double, double* %addr
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, double %ld)
+ ret <vscale x 2 x double> %res
+}
+define <vscale x 4 x half> @dup_ld1rh_half_passthruundef_nxv4f16(<vscale x 4 x i1> %pg, half* %addr) {
+; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load half, half* %addr
+ %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> %pg, half %ld)
+ ret <vscale x 4 x half> %res
+}
+define <vscale x 16 x i8> @dup_ld1rb_i8_passthruzero_nxv16i8(<vscale x 16 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: dup_ld1rb_i8_passthruzero_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, i8 %ld)
+ ret <vscale x 16 x i8> %res
+}
+define <vscale x 8 x i16> @dup_ld1rh_i16_passthruzero_nxv8i16(<vscale x 8 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: dup_ld1rh_i16_passthruzero_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, i16 %ld)
+ ret <vscale x 8 x i16> %res
+}
+define <vscale x 4 x i32> @dup_ld1rs_i32_passthruzero_nxv4i32(<vscale x 4 x i1> %pg, i32* %addr) {
+; CHECK-LABEL: dup_ld1rs_i32_passthruzero_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i32, i32* %addr
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 %ld)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 2 x i64> @dup_ld1rd_i64_passthruzero_nxv2i64(<vscale x 2 x i1> %pg, i64* %addr) {
+; CHECK-LABEL: dup_ld1rd_i64_passthruzero_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load i64, i64* %addr
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ld)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 8 x half> @dup_ld1rh_half_passthruzero_nxv8f16(<vscale x 8 x i1> %pg, half* %addr) {
+; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load half, half* %addr
+ %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, half %ld)
+ ret <vscale x 8 x half> %res
+}
+define <vscale x 4 x float> @dup_ld1rs_float_passthruzero_nxv4f32(<vscale x 4 x i1> %pg, float* %addr) {
+; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load float, float* %addr
+ %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, float %ld)
+ ret <vscale x 4 x float> %res
+}
+define <vscale x 2 x double> @dup_ld1rd_double_passthruzero_nxv2f64(<vscale x 2 x i1> %pg, double* %addr) {
+; CHECK-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load double, double* %addr
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, double %ld)
+ ret <vscale x 2 x double> %res
+}
+define <vscale x 4 x half> @dup_ld1rh_half_passthruzero_nxv4f16(<vscale x 4 x i1> %pg, half* %addr) {
+; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load half, half* %addr
+ %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x i1> %pg, half %ld)
+ ret <vscale x 4 x half> %res
+}
+define <vscale x 2 x half> @dup_ld1rh_half_passthruzero_nxv2f16(<vscale x 2 x i1> %pg, half* %addr) {
+; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load half, half* %addr
+ %res = call <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x i1> %pg, half %ld)
+ ret <vscale x 2 x half> %res
+}
+define <vscale x 2 x float> @dup_ld1rs_float_passthruzero_nxv2f32(<vscale x 2 x i1> %pg, float* %addr) {
+; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %ld = load float, float* %addr
+ %res = call <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, float %ld)
+ ret <vscale x 2 x float> %res
+}
+define <vscale x 16 x i8> @negtest_dup_ld1rb_i8_passthru_nxv16i8(<vscale x 16 x i8> %pt, <vscale x 16 x i1> %pg, i8* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rb_i8_passthru_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: mov z0.b, p0/m, w8
+; CHECK-NEXT: ret
+ %ld = load i8, i8* %addr
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %pt, <vscale x 16 x i1> %pg, i8 %ld)
+ ret <vscale x 16 x i8> %res
+}
+define <vscale x 8 x i16> @negtest_dup_ld1rh_i16_passthru_nxv8i16(<vscale x 8 x i16> %pt, <vscale x 8 x i1> %pg, i16* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rh_i16_passthru_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: mov z0.h, p0/m, w8
+; CHECK-NEXT: ret
+ %ld = load i16, i16* %addr
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %pt, <vscale x 8 x i1> %pg, i16 %ld)
+ ret <vscale x 8 x i16> %res
+}
+define <vscale x 4 x i32> @negtest_dup_ld1rs_i32_passthru_nxv4i32(<vscale x 4 x i32> %pt, <vscale x 4 x i1> %pg, i32* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rs_i32_passthru_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
+ %ld = load i32, i32* %addr
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %pt, <vscale x 4 x i1> %pg, i32 %ld)
+ ret <vscale x 4 x i32> %res
+}
+define <vscale x 2 x i64> @negtest_dup_ld1rd_i64_passthru_nxv2i64(<vscale x 2 x i64> %pt, <vscale x 2 x i1> %pg, i64* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rd_i64_passthru_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: ret
+ %ld = load i64, i64* %addr
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %pt, <vscale x 2 x i1> %pg, i64 %ld)
+ ret <vscale x 2 x i64> %res
+}
+define <vscale x 8 x half> @negtest_dup_ld1rh_half_passthru_nxv8f16(<vscale x 8 x half> %pt, <vscale x 8 x i1> %pg, half* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rh_half_passthru_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+ %ld = load half, half* %addr
+ %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> %pt, <vscale x 8 x i1> %pg, half %ld)
+ ret <vscale x 8 x half> %res
+}
+define <vscale x 4 x float> @negtest_dup_ld1rs_float_passthru_nxv4f32(<vscale x 4 x float> %pt, <vscale x 4 x i1> %pg, float* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rs_float_passthru_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: mov z0.s, p0/m, s1
+; CHECK-NEXT: ret
+ %ld = load float, float* %addr
+ %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> %pt, <vscale x 4 x i1> %pg, float %ld)
+ ret <vscale x 4 x float> %res
+}
+define <vscale x 2 x double> @negtest_dup_ld1rd_double_passthru_nxv2f64(<vscale x 2 x double> %pt, <vscale x 2 x i1> %pg, double* %addr) {
+; CHECK-LABEL: negtest_dup_ld1rd_double_passthru_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: mov z0.d, p0/m, d1
+; CHECK-NEXT: ret
+ %ld = load double, double* %addr
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> %pt, <vscale x 2 x i1> %pg, double %ld)
+ ret <vscale x 2 x double> %res
+}
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
@@ -837,4 +1204,16 @@ declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>,
declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half>, <vscale x 4 x i1>, half)
+declare <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half>, <vscale x 2 x i1>, half)
+declare <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float>, <vscale x 2 x i1>, float)
+
+
attributes #0 = { "target-features"="+sve,+bf16" }
diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
index ee1eb521a1e2b..abc1ad4cdafcb 100644
--- a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
@@ -3416,22 +3416,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 19 4.00 * U ld1h { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1]
# CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.b }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.b }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.d }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.h }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.s }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rd { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rd { z31.d }, p7/z, [sp, #504]
-# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.d }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.h }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.b }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.h }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.b }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rd { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rd { z31.d }, p7/z, [sp, #504]
+# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.h }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.h }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.s }, p7/z, [sp, #126]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0, x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z21.b }, p5/z, [x10, #112]
@@ -3452,22 +3452,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #112]
# CHECK-NEXT: 1 11 0.50 * ld1rqw { z31.s }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.d }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.h }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.s }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.d }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z31.d }, p7/z, [sp, #252]
-# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.d }, p7/z, [sp, #252]
-# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.s }, p7/z, [sp, #252]
+# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.h }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 11 0.50 * ld1rsw { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rsw { z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.s }, p7/z, [sp, #252]
# CHECK-NEXT: 1 11 0.50 * U ld1sb { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 16 2.00 * U ld1sb { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 11 0.50 * ld1sb { z0.h }, p0/z, [sp, x0]
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index 5ba286f21ba6e..f1d43e2a88ccf 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -4472,22 +4472,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 2 9 0.50 * U ld1h { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1]
# CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.b }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.b }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.d }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.h }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.s }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rd { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rd { z31.d }, p7/z, [sp, #504]
-# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.d }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.h }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.b }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.h }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.b }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rd { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rd { z31.d }, p7/z, [sp, #504]
+# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.h }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.h }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.s }, p7/z, [sp, #126]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0, x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z21.b }, p5/z, [x10, #112]
@@ -4508,22 +4508,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #112]
# CHECK-NEXT: 1 6 0.33 * ld1rqw { z31.s }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.d }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.h }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.s }, p7/z, [sp, #63]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.d }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z31.d }, p7/z, [sp, #252]
-# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.d }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.s }, p0/z, [x0]
-# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.d }, p7/z, [sp, #252]
-# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.s }, p7/z, [sp, #252]
+# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.h }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT: 1 6 0.33 * ld1rsw { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rsw { z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.d }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.s }, p0/z, [x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.s }, p7/z, [sp, #252]
# CHECK-NEXT: 1 6 0.33 * U ld1sb { z0.d }, p0/z, [x0]
# CHECK-NEXT: 4 9 1.00 * U ld1sb { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 6 0.50 * ld1sb { z0.h }, p0/z, [sp, x0]