[llvm] 30b045a - [AArch64][SVE] Extend LD1RQ ISel patterns to cover missing addressing modes

Matt Devereau via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 25 06:07:52 PDT 2022


Author: Matt Devereau
Date: 2022-08-25T13:07:37Z
New Revision: 30b045aba603e61239ff121de908597c536d2c8e

URL: https://github.com/llvm/llvm-project/commit/30b045aba603e61239ff121de908597c536d2c8e
DIFF: https://github.com/llvm/llvm-project/commit/30b045aba603e61239ff121de908597c536d2c8e.diff

LOG: [AArch64][SVE] Extend LD1RQ ISel patterns to cover missing addressing modes

Add some missing patterns for ld1rq's scalar + scalar addressing mode.
Also add the scalar + imm and scalar + scalar addressing modes for
the patterns added in https://reviews.llvm.org/D130010.
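
For example (an illustrative sketch mirroring the ld1rqb_i8_scalar test
added in this patch; the function and value names are placeholders), an
ld1rq intrinsic whose address is a base pointer plus a register index can
now select the reg+reg addressing form directly:

  define <vscale x 16 x i8> @example(<vscale x 16 x i1> %pg, i8* %base, i64 %idx) {
    %ptr = getelementptr inbounds i8, i8* %base, i64 %idx
    ; With this patch the load below selects:  ld1rqb { z0.b }, p0/z, [x0, x1]
    ; instead of materialising the address with a separate add.
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pg, i8* %ptr)
    ret <vscale x 16 x i8> %res
  }
  declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)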

Differential Revision: https://reviews.llvm.org/D130993

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
    llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
    llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 08dd920c91378..983631af47c76 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -937,15 +937,20 @@ let Predicates = [HasSVEorSME] in {
   defm LD1RQ_W      : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
   defm LD1RQ_D      : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
 
-  let AddedComplexity = 1 in {
-  class LD1RQPat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr, Instruction ptrue> :
-          Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
-          (load_instr (ptrue 31), GPR64sp:$Xn, 0)>;
+  multiclass sve_ld1rq_duplane_pat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr_imm, Instruction ptrue, Instruction load_instr_scalar, ComplexPattern AddrCP> {
+    def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+          (load_instr_imm (ptrue 31), GPR64sp:$Xn, 0)>;
+    let AddedComplexity = 2 in {
+      def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (add GPR64sp:$Xn, simm4s16:$imm))), (i64 0))), (i64 0))),
+            (load_instr_imm (ptrue 31), GPR64sp:$Xn, simm4s16:$imm)>;
+    }
+    def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (AddrCP GPR64sp:$Xn, GPR64sp:$idx))), (i64 0))), (i64 0))),
+          (load_instr_scalar (ptrue 31), GPR64sp:$Xn, $idx)>;
   }
-  def : LD1RQPat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B>;
-  def : LD1RQPat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H>;
-  def : LD1RQPat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S>;
-  def : LD1RQPat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D>;
+  defm : sve_ld1rq_duplane_pat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B, LD1RQ_B, am_sve_regreg_lsl0>;
+  defm : sve_ld1rq_duplane_pat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H, LD1RQ_H, am_sve_regreg_lsl1>;
+  defm : sve_ld1rq_duplane_pat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S, LD1RQ_W, am_sve_regreg_lsl2>;
+  defm : sve_ld1rq_duplane_pat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D, LD1RQ_D, am_sve_regreg_lsl3>;
 
   // continuous load with reg+reg addressing.
   defm LD1B    : sve_mem_cld_ss<0b0000, "ld1b",  Z_b, ZPR8,  GPR64NoXZRshifted8>;
@@ -2260,24 +2265,22 @@ let Predicates = [HasSVEorSME] in {
   def : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
   def : LD1RPat<nxv2f64, load, LD1RD_IMM,   PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
 
-  // LD1R of 128-bit masked data
-  def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
-            (LD1RQ_B_IMM $gp, $base, (i64 0))>;
-  def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
-            (LD1RQ_H_IMM $gp, $base, (i64 0))>;
-  def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
-            (LD1RQ_W_IMM $gp, $base, (i64 0))>;
-  def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
-            (LD1RQ_D_IMM $gp, $base, (i64 0))>;
-
-  def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
-            (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
-  def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
-            (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
-  def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
-            (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
-  def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
-            (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
+// LD1R of 128-bit masked data
+  multiclass ld1rq_pat<ValueType vt1, SDPatternOperator op, Instruction load_instr, ComplexPattern AddrCP>{
+    def : Pat<(vt1 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+          (!cast<Instruction>(load_instr # _IMM) $gp, $base, (i64 0))>;
+    let AddedComplexity = 2 in {
+      def : Pat<(vt1 (op PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (!cast<Instruction>(load_instr # _IMM) $gp, $base, simm4s16:$imm)>;
+    }
+    def : Pat<(vt1 (op PPR:$gp, (AddrCP GPR64:$base, GPR64:$idx))),
+          (load_instr $gp, $base, $idx)>;
+  }
+
+  defm : ld1rq_pat<nxv16i8, AArch64ld1rq_z, LD1RQ_B, am_sve_regreg_lsl0>;
+  defm : ld1rq_pat<nxv8i16, AArch64ld1rq_z, LD1RQ_H, am_sve_regreg_lsl1>;
+  defm : ld1rq_pat<nxv4i32, AArch64ld1rq_z, LD1RQ_W, am_sve_regreg_lsl2>;
+  defm : ld1rq_pat<nxv2i64, AArch64ld1rq_z, LD1RQ_D, am_sve_regreg_lsl3>;
 
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index ea5ad352c28b7..423bebeb2bdc4 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -24,6 +24,16 @@ define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, i8* %addr) {
   ret <vscale x 16 x i8> %res
 }
 
+define <vscale x 16 x i8> @ld1rqb_i8_scalar(<vscale x 16 x i1> %pred, i8* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqb_i8_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
 define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
 ; CHECK:       // %bb.0:
@@ -47,8 +57,8 @@ define <vscale x 16 x i8> @ld1rqb_i8_imm_upper_bound(<vscale x 16 x i1> %pred, i
 define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #129
-; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    mov x8, #-129
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %addr, i64 -129
   %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
@@ -58,14 +68,41 @@ define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %
 define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #113
-; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    mov w8, #113
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %addr, i64 113
   %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
   ret <vscale x 16 x i8> %res
 }
 
+define <vscale x 16 x i8> @ld1rqb_i8_imm_dupqlane(<vscale x 8 x i1> %pred, <16 x i8>* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <16 x i8>, <16 x i8>* %addr, i16 -1
+  %load = load <16 x i8>, <16 x i8>* %ptr
+  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
+  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_scalar_dupqlane(<vscale x 8 x i1> %pred, i8* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqb_i8_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
+  %ptr_bitcast = bitcast i8* %ptr to <16 x i8>*
+  %load = load <16 x i8>, <16 x i8>* %ptr_bitcast
+  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
+  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+  ret <vscale x 16 x i8> %2
+}
+
 ;
 ; LD1RQH
 ;
@@ -108,6 +145,26 @@ define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, half* %addr)
   ret <vscale x 8 x half> %res
 }
 
+define <vscale x 8 x i16> @ld1rqh_i16_scalar(<vscale x 8 x i1> %pred, i16* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_i16_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %ptr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_scalar(<vscale x 8 x i1> %pred, half* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_f16_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %idx
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %ptr)
+  ret <vscale x 8 x half> %res
+}
+
 define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
 ; CHECK-LABEL: ld1rqh_bf16:
 ; CHECK:       // %bb.0:
@@ -127,6 +184,97 @@ define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, bfloat* %
   ret <vscale x 8 x bfloat> %res
 }
 
+define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar(<vscale x 8 x i1> %pred, bfloat* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_bf16_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x i16>* %addr) {
+; CHECK-LABEL: ld1rqh_i16_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <8 x i16>, <8 x i16>* %addr, i16 -1
+  %load = load <8 x i16>, <8 x i16>* %ptr
+  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
+  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_scalar_dupqlane(<vscale x 8 x i1> %pred, i16* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_i16_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx
+  %ptr_bitcast = bitcast i16* %ptr to <8 x i16>*
+  %load = load <8 x i16>, <8 x i16>* %ptr_bitcast
+  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
+  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x half>* %addr) {
+; CHECK-LABEL: ld1rqh_f16_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <8 x half>, <8 x half>* %addr, i16 -1
+  %load = load <8 x half>, <8 x half>* %ptr
+  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
+  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_scalar_dupqlane(<vscale x 8 x i1> %pred, half* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_f16_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %idx
+  %ptr_bitcast = bitcast half* %ptr to <8 x half>*
+  %load = load <8 x half>, <8 x half>* %ptr_bitcast
+  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
+  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x bfloat>* %addr) {
+; CHECK-LABEL: ld1rqh_bf16_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <8 x bfloat>, <8 x bfloat>* %addr, i16 -1
+  %load = load <8 x bfloat>, <8 x bfloat>* %ptr
+  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
+  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+  ret <vscale x 8 x bfloat> %2
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar_dupqlane(<vscale x 8 x i1> %pred, bfloat* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_bf16_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx
+  %ptr_bitcast = bitcast bfloat* %ptr to <8 x bfloat>*
+  %load = load <8 x bfloat>, <8 x bfloat>* %ptr_bitcast
+  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
+  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+  ret <vscale x 8 x bfloat> %2
+}
+
 ;
 ; LD1RQW
 ;
@@ -169,6 +317,80 @@ define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, float* %add
   ret <vscale x 4 x float> %res
 }
 
+define <vscale x 4 x i32> @ld1rqw_i32_scalar(<vscale x 4 x i1> %pred, i32* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqw_i32_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i32, i32* %base, i64 %idx
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %ptr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_scalar(<vscale x 4 x i1> %pred, float* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqw_f32_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds float, float* %base, i64 %idx
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %ptr)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_imm_dupqlane(<vscale x 4 x i1> %pred, <4 x i32>* %addr) {
+; CHECK-LABEL: ld1rqw_i32_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %addr, i32 1
+  %load = load <4 x i32>, <4 x i32>* %ptr
+  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_scalar_dupqlane(<vscale x 4 x i1> %pred, i32* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqw_i32_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %idx
+  %ptr_bitcast = bitcast i32* %ptr to <4 x i32>*
+  %load = load <4 x i32>, <4 x i32>* %ptr_bitcast
+  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_imm_dupqlane(<vscale x 4 x i1> %pred, <4 x float>* %addr) {
+; CHECK-LABEL: ld1rqw_f32_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <4 x float>, <4 x float>* %addr, i32 1
+  %load = load <4 x float>, <4 x float>* %ptr
+  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
+  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_scalar_dupqlane(<vscale x 4 x i1> %pred, float* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqw_f32_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds float, float* %addr, i64 %idx
+  %ptr_bitcast = bitcast float* %ptr to <4 x float>*
+  %load = load <4 x float>, <4 x float>* %ptr_bitcast
+  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
+  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+  ret <vscale x 4 x float> %2
+}
+
 ;
 ; LD1RQD
 ;
@@ -211,6 +433,80 @@ define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, double* %a
   ret <vscale x 2 x double> %res
 }
 
+define <vscale x 2 x i64> @ld1rqd_i64_scalar(<vscale x 2 x i1> %pred, i64* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqd_i64_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i64, i64* %base, i64 %idx
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %ptr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_scalar(<vscale x 2 x i1> %pred, double* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqd_f64_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds double, double* %base, i64 %idx
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %ptr)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_imm_dupqlane(<vscale x 2 x i1> %pred, <2 x i64>* %addr) {
+; CHECK-LABEL: ld1rqd_i64_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <2 x i64>, <2 x i64>* %addr, i64 1
+  %load = load <2 x i64>, <2 x i64>* %ptr
+  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_scalar_dupqlane(<vscale x 2 x i1> %pred, i64* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqd_i64_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i64, i64* %addr, i64 %idx
+  %ptr_bitcast = bitcast i64* %ptr to <2 x i64>*
+  %load = load <2 x i64>, <2 x i64>* %ptr_bitcast
+  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_imm_dupqlane(<vscale x 2 x i1> %pred, <2 x double>* %addr) {
+; CHECK-LABEL: ld1rqd_f64_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <2 x double>, <2 x double>* %addr, i64 1
+  %load = load <2 x double>, <2 x double>* %ptr
+  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
+  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_scalar_dupqlane(<vscale x 2 x i1> %pred, double* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqd_f64_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds double, double* %addr, i64 %idx
+  %ptr_bitcast = bitcast double* %ptr to <2 x double>*
+  %load = load <2 x double>, <2 x double>* %ptr_bitcast
+  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
+  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+  ret <vscale x 2 x double> %2
+}
+
 ;
 ; LDNT1B
 ;
@@ -616,3 +912,21 @@ declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale
 declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
 declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)

diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
index ec1ca5410a3f2..0f1d46f43bd4b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s
@@ -3432,22 +3432,22 @@ zip2	z31.s, z31.s, z31.s
 # CHECK-NEXT:  1      11    0.50    *             U     ld1rh	{ z31.d }, p7/z, [sp, #126]
 # CHECK-NEXT:  1      11    0.50    *             U     ld1rh	{ z31.h }, p7/z, [sp, #126]
 # CHECK-NEXT:  1      11    0.50    *             U     ld1rh	{ z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT:  1      11    0.50    *             U     ld1rqb	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      11    0.50    *                   ld1rqb	{ z0.b }, p0/z, [x0, x0]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqb	{ z0.b }, p0/z, [x0]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqb	{ z21.b }, p5/z, [x10, #112]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqb	{ z23.b }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqb	{ z31.b }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      11    0.50    *             U     ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      11    0.50    *                   ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqd	{ z0.d }, p0/z, [x0]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqd	{ z23.d }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqd	{ z23.d }, p3/z, [x13, #112]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqd	{ z31.d }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      11    0.50    *             U     ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      11    0.50    *                   ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqh	{ z0.h }, p0/z, [x0]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqh	{ z23.h }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqh	{ z23.h }, p3/z, [x13, #112]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqh	{ z31.h }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      11    0.50    *             U     ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      11    0.50    *                   ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqw	{ z0.s }, p0/z, [x0]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqw	{ z23.s }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      11    0.50    *                   ld1rqw	{ z23.s }, p3/z, [x13, #112]

diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index dfaa601300ea2..062ac80bd718b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -4488,22 +4488,22 @@ zip2	z31.s, z31.s, z31.s
 # CHECK-NEXT:  1      6     0.33    *             U     ld1rh	{ z31.d }, p7/z, [sp, #126]
 # CHECK-NEXT:  1      6     0.33    *             U     ld1rh	{ z31.h }, p7/z, [sp, #126]
 # CHECK-NEXT:  1      6     0.33    *             U     ld1rh	{ z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT:  1      6     0.33    *             U     ld1rqb	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      6     0.33    *                   ld1rqb	{ z0.b }, p0/z, [x0, x0]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqb	{ z0.b }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqb	{ z21.b }, p5/z, [x10, #112]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqb	{ z23.b }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqb	{ z31.b }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      6     0.33    *             U     ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      6     0.33    *                   ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqd	{ z0.d }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqd	{ z23.d }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqd	{ z23.d }, p3/z, [x13, #112]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqd	{ z31.d }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      6     0.33    *             U     ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      6     0.33    *                   ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqh	{ z0.h }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqh	{ z23.h }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqh	{ z23.h }, p3/z, [x13, #112]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqh	{ z31.h }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      6     0.33    *             U     ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      6     0.33    *                   ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqw	{ z0.s }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqw	{ z23.s }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *                   ld1rqw	{ z23.s }, p3/z, [x13, #112]


        

