[llvm] 30b045a - [AArch64][SVE] Extend LD1RQ ISel patterns to cover missing addressing modes
Matt Devereau via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 25 06:07:52 PDT 2022
Author: Matt Devereau
Date: 2022-08-25T13:07:37Z
New Revision: 30b045aba603e61239ff121de908597c536d2c8e
URL: https://github.com/llvm/llvm-project/commit/30b045aba603e61239ff121de908597c536d2c8e
DIFF: https://github.com/llvm/llvm-project/commit/30b045aba603e61239ff121de908597c536d2c8e.diff
LOG: [AArch64][SVE] Extend LD1RQ ISel patterns to cover missing addressing modes
Add some missing patterns for ld1rq's scalar + scalar addressing mode.
Also, add the scalar + imm and scalar + scalar addressing modes for
the patterns added in https://reviews.llvm.org/D130010.
Differential Revision: https://reviews.llvm.org/D130993
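
As a concrete illustration, an ld1rq intrinsic call whose address is computed
from a runtime index can now select the reg+reg form directly. This is a
minimal sketch mirroring the new ld1rqb_i8_scalar test below; the function
name is illustrative only:

  declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)

  define <vscale x 16 x i8> @ld1rq_regreg_example(<vscale x 16 x i1> %pred, i8* %addr, i64 %idx) {
    ; base + scalar index feeding the replicating quadword load
    %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
    ret <vscale x 16 x i8> %res
  }

With the new patterns this lowers to a single
"ld1rqb { z0.b }, p0/z, [x0, x1]" rather than materialising the address with
a separate add and using the immediate-offset form.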
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 08dd920c91378..983631af47c76 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -937,15 +937,20 @@ let Predicates = [HasSVEorSME] in {
defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
- let AddedComplexity = 1 in {
- class LD1RQPat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr, Instruction ptrue> :
- Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
- (load_instr (ptrue 31), GPR64sp:$Xn, 0)>;
+ multiclass sve_ld1rq_duplane_pat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr_imm, Instruction ptrue, Instruction load_instr_scalar, ComplexPattern AddrCP> {
+ def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+ (load_instr_imm (ptrue 31), GPR64sp:$Xn, 0)>;
+ let AddedComplexity = 2 in {
+ def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (add GPR64sp:$Xn, simm4s16:$imm))), (i64 0))), (i64 0))),
+ (load_instr_imm (ptrue 31), GPR64sp:$Xn, simm4s16:$imm)>;
+ }
+ def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (AddrCP GPR64sp:$Xn, GPR64sp:$idx))), (i64 0))), (i64 0))),
+ (load_instr_scalar (ptrue 31), GPR64sp:$Xn, $idx)>;
}
- def : LD1RQPat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B>;
- def : LD1RQPat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H>;
- def : LD1RQPat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S>;
- def : LD1RQPat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D>;
+ defm : sve_ld1rq_duplane_pat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B, LD1RQ_B, am_sve_regreg_lsl0>;
+ defm : sve_ld1rq_duplane_pat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H, LD1RQ_H, am_sve_regreg_lsl1>;
+ defm : sve_ld1rq_duplane_pat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S, LD1RQ_W, am_sve_regreg_lsl2>;
+ defm : sve_ld1rq_duplane_pat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D, LD1RQ_D, am_sve_regreg_lsl3>;
// continuous load with reg+reg addressing.
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
@@ -2260,24 +2265,22 @@ let Predicates = [HasSVEorSME] in {
def : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
def : LD1RPat<nxv2f64, load, LD1RD_IMM, PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
- // LD1R of 128-bit masked data
- def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
- (LD1RQ_B_IMM $gp, $base, (i64 0))>;
- def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
- (LD1RQ_H_IMM $gp, $base, (i64 0))>;
- def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
- (LD1RQ_W_IMM $gp, $base, (i64 0))>;
- def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
- (LD1RQ_D_IMM $gp, $base, (i64 0))>;
-
- def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
- (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
- def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
- (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
- def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
- (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
- def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
- (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
+// LD1R of 128-bit masked data
+ multiclass ld1rq_pat<ValueType vt1, SDPatternOperator op, Instruction load_instr, ComplexPattern AddrCP>{
+ def : Pat<(vt1 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (!cast<Instruction>(load_instr # _IMM) $gp, $base, (i64 0))>;
+ let AddedComplexity = 2 in {
+ def : Pat<(vt1 (op PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (!cast<Instruction>(load_instr # _IMM) $gp, $base, simm4s16:$imm)>;
+ }
+ def : Pat<(vt1 (op PPR:$gp, (AddrCP GPR64:$base, GPR64:$idx))),
+ (load_instr $gp, $base, $idx)>;
+ }
+
+ defm : ld1rq_pat<nxv16i8, AArch64ld1rq_z, LD1RQ_B, am_sve_regreg_lsl0>;
+ defm : ld1rq_pat<nxv8i16, AArch64ld1rq_z, LD1RQ_H, am_sve_regreg_lsl1>;
+ defm : ld1rq_pat<nxv4i32, AArch64ld1rq_z, LD1RQ_W, am_sve_regreg_lsl2>;
+ defm : ld1rq_pat<nxv2i64, AArch64ld1rq_z, LD1RQ_D, am_sve_regreg_lsl3>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index ea5ad352c28b7..423bebeb2bdc4 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -24,6 +24,16 @@ define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, i8* %addr) {
ret <vscale x 16 x i8> %res
}
+define <vscale x 16 x i8> @ld1rqb_i8_scalar(<vscale x 16 x i1> %pred, i8* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqb_i8_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+ ret <vscale x 16 x i8> %res
+}
+
define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
; CHECK: // %bb.0:
@@ -47,8 +57,8 @@ define <vscale x 16 x i8> @ld1rqb_i8_imm_upper_bound(<vscale x 16 x i1> %pred, i
define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub x8, x0, #129
-; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT: mov x8, #-129
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 -129
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
@@ -58,14 +68,41 @@ define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %
define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #113
-; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT: mov w8, #113
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 113
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
ret <vscale x 16 x i8> %res
}
+define <vscale x 16 x i8> @ld1rqb_i8_imm_dupqlane(<vscale x 8 x i1> %pred, <16 x i8>* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <16 x i8>, <16 x i8>* %addr, i16 -1
+ %load = load <16 x i8>, <16 x i8>* %ptr
+ %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
+ %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+ ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_scalar_dupqlane(<vscale x 8 x i1> %pred, i8* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqb_i8_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
+ %ptr_bitcast = bitcast i8* %ptr to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %ptr_bitcast
+ %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
+ %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+ ret <vscale x 16 x i8> %2
+}
+
;
; LD1RQH
;
@@ -108,6 +145,26 @@ define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, half* %addr)
ret <vscale x 8 x half> %res
}
+define <vscale x 8 x i16> @ld1rqh_i16_scalar(<vscale x 8 x i1> %pred, i16* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_i16_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %ptr)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_scalar(<vscale x 8 x i1> %pred, half* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_f16_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds half, half* %addr, i64 %idx
+ %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %ptr)
+ ret <vscale x 8 x half> %res
+}
+
define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ld1rqh_bf16:
; CHECK: // %bb.0:
@@ -127,6 +184,97 @@ define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, bfloat* %
ret <vscale x 8 x bfloat> %res
}
+define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar(<vscale x 8 x i1> %pred, bfloat* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_bf16_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x i16>* %addr) {
+; CHECK-LABEL: ld1rqh_i16_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <8 x i16>, <8 x i16>* %addr, i16 -1
+ %load = load <8 x i16>, <8 x i16>* %ptr
+ %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
+ %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_scalar_dupqlane(<vscale x 8 x i1> %pred, i16* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_i16_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx
+ %ptr_bitcast = bitcast i16* %ptr to <8 x i16>*
+ %load = load <8 x i16>, <8 x i16>* %ptr_bitcast
+ %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
+ %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x half>* %addr) {
+; CHECK-LABEL: ld1rqh_f16_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <8 x half>, <8 x half>* %addr, i16 -1
+ %load = load <8 x half>, <8 x half>* %ptr
+ %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
+ %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_scalar_dupqlane(<vscale x 8 x i1> %pred, half* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_f16_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds half, half* %addr, i64 %idx
+ %ptr_bitcast = bitcast half* %ptr to <8 x half>*
+ %load = load <8 x half>, <8 x half>* %ptr_bitcast
+ %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
+ %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x bfloat>* %addr) {
+; CHECK-LABEL: ld1rqh_bf16_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <8 x bfloat>, <8 x bfloat>* %addr, i16 -1
+ %load = load <8 x bfloat>, <8 x bfloat>* %ptr
+ %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
+ %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+ ret <vscale x 8 x bfloat> %2
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar_dupqlane(<vscale x 8 x i1> %pred, bfloat* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_bf16_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx
+ %ptr_bitcast = bitcast bfloat* %ptr to <8 x bfloat>*
+ %load = load <8 x bfloat>, <8 x bfloat>* %ptr_bitcast
+ %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
+ %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+ ret <vscale x 8 x bfloat> %2
+}
+
;
; LD1RQW
;
@@ -169,6 +317,80 @@ define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, float* %add
ret <vscale x 4 x float> %res
}
+define <vscale x 4 x i32> @ld1rqw_i32_scalar(<vscale x 4 x i1> %pred, i32* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqw_i32_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i32, i32* %base, i64 %idx
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %ptr)
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_scalar(<vscale x 4 x i1> %pred, float* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqw_f32_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds float, float* %base, i64 %idx
+ %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %ptr)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_imm_dupqlane(<vscale x 4 x i1> %pred, <4 x i32>* %addr) {
+; CHECK-LABEL: ld1rqw_i32_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %addr, i32 1
+ %load = load <4 x i32>, <4 x i32>* %ptr
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
+ %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_scalar_dupqlane(<vscale x 4 x i1> %pred, i32* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqw_i32_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i32, i32* %addr, i64 %idx
+ %ptr_bitcast = bitcast i32* %ptr to <4 x i32>*
+ %load = load <4 x i32>, <4 x i32>* %ptr_bitcast
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
+ %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_imm_dupqlane(<vscale x 4 x i1> %pred, <4 x float>* %addr) {
+; CHECK-LABEL: ld1rqw_f32_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <4 x float>, <4 x float>* %addr, i32 1
+ %load = load <4 x float>, <4 x float>* %ptr
+ %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
+ %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+ ret <vscale x 4 x float> %2
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_scalar_dupqlane(<vscale x 4 x i1> %pred, float* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqw_f32_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds float, float* %addr, i64 %idx
+ %ptr_bitcast = bitcast float* %ptr to <4 x float>*
+ %load = load <4 x float>, <4 x float>* %ptr_bitcast
+ %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
+ %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+ ret <vscale x 4 x float> %2
+}
+
;
; LD1RQD
;
@@ -211,6 +433,80 @@ define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, double* %a
ret <vscale x 2 x double> %res
}
+define <vscale x 2 x i64> @ld1rqd_i64_scalar(<vscale x 2 x i1> %pred, i64* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqd_i64_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i64, i64* %base, i64 %idx
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %ptr)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_scalar(<vscale x 2 x i1> %pred, double* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqd_f64_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds double, double* %base, i64 %idx
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %ptr)
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_imm_dupqlane(<vscale x 2 x i1> %pred, <2 x i64>* %addr) {
+; CHECK-LABEL: ld1rqd_i64_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <2 x i64>, <2 x i64>* %addr, i64 1
+ %load = load <2 x i64>, <2 x i64>* %ptr
+ %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
+ %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_scalar_dupqlane(<vscale x 2 x i1> %pred, i64* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqd_i64_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i64, i64* %addr, i64 %idx
+ %ptr_bitcast = bitcast i64* %ptr to <2 x i64>*
+ %load = load <2 x i64>, <2 x i64>* %ptr_bitcast
+ %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
+ %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_imm_dupqlane(<vscale x 2 x i1> %pred, <2 x double>* %addr) {
+; CHECK-LABEL: ld1rqd_f64_imm_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #16]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds <2 x double>, <2 x double>* %addr, i64 1
+ %load = load <2 x double>, <2 x double>* %ptr
+ %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
+ %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_scalar_dupqlane(<vscale x 2 x i1> %pred, double* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqd_f64_scalar_dupqlane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds double, double* %addr, i64 %idx
+ %ptr_bitcast = bitcast double* %ptr to <2 x double>*
+ %load = load <2 x double>, <2 x double>* %ptr_bitcast
+ %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
+ %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+ ret <vscale x 2 x double> %2
+}
+
;
; LDNT1B
;
@@ -616,3 +912,21 @@ declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale
declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
index ec1ca5410a3f2..0f1d46f43bd4b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
@@ -3432,22 +3432,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.d }, p7/z, [sp, #126]
# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.h }, p7/z, [sp, #126]
# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 11 0.50 * U ld1rqb { z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0, x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z21.b }, p5/z, [x10, #112]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z23.b }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 11 0.50 * ld1rqb { z31.b }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 11 0.50 * U ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT: 1 11 0.50 * ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3]
# CHECK-NEXT: 1 11 0.50 * ld1rqd { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqd { z23.d }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 11 0.50 * ld1rqd { z23.d }, p3/z, [x13, #112]
# CHECK-NEXT: 1 11 0.50 * ld1rqd { z31.d }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 11 0.50 * U ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT: 1 11 0.50 * ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1]
# CHECK-NEXT: 1 11 0.50 * ld1rqh { z0.h }, p0/z, [x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqh { z23.h }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 11 0.50 * ld1rqh { z23.h }, p3/z, [x13, #112]
# CHECK-NEXT: 1 11 0.50 * ld1rqh { z31.h }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 11 0.50 * U ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT: 1 11 0.50 * ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2]
# CHECK-NEXT: 1 11 0.50 * ld1rqw { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #112]
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index dfaa601300ea2..062ac80bd718b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -4488,22 +4488,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.d }, p7/z, [sp, #126]
# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.h }, p7/z, [sp, #126]
# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT: 1 6 0.33 * U ld1rqb { z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0, x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z21.b }, p5/z, [x10, #112]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z23.b }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 6 0.33 * ld1rqb { z31.b }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 6 0.33 * U ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT: 1 6 0.33 * ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3]
# CHECK-NEXT: 1 6 0.33 * ld1rqd { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqd { z23.d }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 6 0.33 * ld1rqd { z23.d }, p3/z, [x13, #112]
# CHECK-NEXT: 1 6 0.33 * ld1rqd { z31.d }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 6 0.33 * U ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT: 1 6 0.33 * ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1]
# CHECK-NEXT: 1 6 0.33 * ld1rqh { z0.h }, p0/z, [x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqh { z23.h }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 6 0.33 * ld1rqh { z23.h }, p3/z, [x13, #112]
# CHECK-NEXT: 1 6 0.33 * ld1rqh { z31.h }, p7/z, [sp, #-16]
-# CHECK-NEXT: 1 6 0.33 * U ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT: 1 6 0.33 * ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2]
# CHECK-NEXT: 1 6 0.33 * ld1rqw { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #-128]
# CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #112]