[llvm-branch-commits] [llvm] 7832b42 - [CodeGen] Disable FP LD1RX instructions generation for Neoverse-V1

Tobias Hieta via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Aug 10 00:09:39 PDT 2023


Author: Igor Kirillov
Date: 2023-08-10T09:05:30+02:00
New Revision: 7832b42073d61d598be98b43d5896f578bfc3950

URL: https://github.com/llvm/llvm-project/commit/7832b42073d61d598be98b43d5896f578bfc3950
DIFF: https://github.com/llvm/llvm-project/commit/7832b42073d61d598be98b43d5896f578bfc3950.diff

LOG: [CodeGen] Disable FP LD1RX instructions generation for Neoverse-V1

These instructions show worse performance on Neoverse-V1 compared
to a pair of LDR(LDP)/MOV instructions.
This patch adds the `no-sve-fp-ld1r` sub-target feature, which is enabled
only on Neoverse-V1.

Fixes https://github.com/llvm/llvm-project/issues/64498

Differential Revision: https://reviews.llvm.org/D157279

(cherry picked from commit 60e2a849b0a537f96ca12fb032c4a0e32e07b4ae)

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td
    llvm/test/CodeGen/AArch64/sve-ld1r.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 05adbe27c948cb..8f50af4b71fd9a 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -148,6 +148,9 @@ def FeatureExperimentalZeroingPseudos
 def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
   "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
 
+def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
+  "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
+
 def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
   "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)",
   [FeatureSVE, FeatureUseScalarIncVL]>;
@@ -1137,7 +1140,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
                                       FeatureLSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
-                                      FeaturePredictableSelectIsExpensive]>;
+                                      FeaturePredictableSelectIsExpensive,
+                                      FeatureNoSVEFPLD1R]>;
 
 def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2",
                                       "Neoverse V2 ARM processors", [

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 565d629841b940..0f3d3461767801 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -262,6 +262,8 @@ def UseNegativeImmediates
 
 def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;
 
+def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
+
 def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
 
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",

diff  --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 0710c654a95df6..b4f02e0dd20374 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2355,13 +2355,15 @@ let Predicates = [HasSVEorSME] in {
   // LDR1 of 64-bit data
   defm : LD1RPat<nxv2i64, load, LD1RD_IMM, PTRUE_D, i64, am_indexed64_6b, uimm6s8>;
 
-  // LD1R of FP data
-  defm : LD1RPat<nxv8f16, load, LD1RH_IMM,   PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
-  defm : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
-  defm : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
-  defm : LD1RPat<nxv4f32, load, LD1RW_IMM,   PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
-  defm : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
-  defm : LD1RPat<nxv2f64, load, LD1RD_IMM,   PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
+  let Predicates = [HasSVEorSME, UseSVEFPLD1R] in {
+    // LD1R of FP data
+    defm : LD1RPat<nxv8f16, load, LD1RH_IMM,   PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
+    defm : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
+    defm : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
+    defm : LD1RPat<nxv4f32, load, LD1RW_IMM,   PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
+    defm : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
+    defm : LD1RPat<nxv2f64, load, LD1RD_IMM,   PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
+  }
 
 // LD1R of 128-bit masked data
   multiclass ld1rq_pat<ValueType vt1, SDPatternOperator op, Instruction load_instr, ComplexPattern AddrCP>{

diff  --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c4c0dca114ce7c..4902ec3639ec54 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7203,6 +7203,10 @@ multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
 
   def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)),
             (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
+  def : Pat<(nxv4f16 (op nxv4i1:$pg, f16:$splat, nxv4f16:$passthru)),
+            (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
+  def : Pat<(nxv2f16 (op nxv2i1:$pg, f16:$splat, nxv2f16:$passthru)),
+            (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
   def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)),
             (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
   def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)),

diff  --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index 632641e7042d8f..a1103fc28a2eed 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-LD1R
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+no-sve-fp-ld1r < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-LD1R
 ;
 ; Check that ldr1* instruction is generated to splat scalar during load,
 ; rather than mov from scalar to vector register (which would require the vector unit).
@@ -406,10 +407,28 @@ define <vscale x 2 x i64> @ld1rd_gep_out_of_range_down(ptr %valp) {
 }
 
 define <vscale x 8 x half> @ld1rh_half(ptr %valp) {
-; CHECK-LABEL: ld1rh_half:
+; CHECK-LD1R-LABEL: ld1rh_half:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.h
+; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
+  %val = load half, ptr %valp
+  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %shf
+}
+
+define <vscale x 8 x half> @ld1rh_half_neoverse(ptr %valp) #1 {
+; CHECK-LABEL: ld1rh_half_neoverse:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %val = load half, ptr %valp
   %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
@@ -418,11 +437,17 @@ define <vscale x 8 x half> @ld1rh_half(ptr %valp) {
 }
 
 define <vscale x 8 x half> @ld1rh_half_gep(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_gep:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0, #126]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_gep:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.h
+; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0, #126]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_gep:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #126]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 63
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
@@ -431,12 +456,18 @@ define <vscale x 8 x half> @ld1rh_half_gep(ptr %valp) {
 }
 
 define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_gep_out_of_range_up:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #128
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    add x8, x0, #128
+; CHECK-LD1R-NEXT:    ptrue p0.h
+; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #128]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 64
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
@@ -445,12 +476,18 @@ define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(ptr %valp) {
 }
 
 define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_down(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_gep_out_of_range_down:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    sub x8, x0, #2
+; CHECK-LD1R-NEXT:    ptrue p0.h
+; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldur h0, [x0, #-2]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 -1
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
@@ -459,11 +496,17 @@ define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_down(ptr %valp) {
 }
 
 define <vscale x 4 x half> @ld1rh_half_unpacked4(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked4:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %val = load half, ptr %valp
   %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
   %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
@@ -471,11 +514,17 @@ define <vscale x 4 x half> @ld1rh_half_unpacked4(ptr %valp) {
 }
 
 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked4_gep:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0, #126]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0, #126]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #126]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 63
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
@@ -484,12 +533,18 @@ define <vscale x 4 x half> @ld1rh_half_unpacked4_gep(ptr %valp) {
 }
 
 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #128
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    add x8, x0, #128
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #128]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 64
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
@@ -498,12 +553,18 @@ define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp)
 }
 
 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_down(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    sub x8, x0, #2
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldur h0, [x0, #-2]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 -1
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
@@ -512,11 +573,17 @@ define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_down(ptr %valp
 }
 
 define <vscale x 2 x half> @ld1rh_half_unpacked2(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked2:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %val = load half, ptr %valp
   %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
   %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
@@ -524,11 +591,17 @@ define <vscale x 2 x half> @ld1rh_half_unpacked2(ptr %valp) {
 }
 
 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked2_gep:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0, #126]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x0, #126]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #126]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 63
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
@@ -537,12 +610,18 @@ define <vscale x 2 x half> @ld1rh_half_unpacked2_gep(ptr %valp) {
 }
 
 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #128
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    add x8, x0, #128
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #128]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 64
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
@@ -551,12 +630,18 @@ define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp)
 }
 
 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_down(ptr %valp) {
-; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    sub x8, x0, #2
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldur h0, [x0, #-2]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr half, ptr %valp, i32 -1
   %val = load half, ptr %valp2
   %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
@@ -565,11 +650,17 @@ define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_down(ptr %valp
 }
 
 define <vscale x 4 x float> @ld1rw_float(ptr %valp) {
-; CHECK-LABEL: ld1rw_float:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %val = load float, ptr %valp
   %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
   %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
@@ -577,11 +668,17 @@ define <vscale x 4 x float> @ld1rw_float(ptr %valp) {
 }
 
 define <vscale x 4 x float> @ld1rw_float_gep(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_gep:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0, #252]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_gep:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0, #252]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_gep:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #252]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr float, ptr %valp, i32 63
   %val = load float, ptr %valp2
   %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
@@ -590,12 +687,18 @@ define <vscale x 4 x float> @ld1rw_float_gep(ptr %valp) {
 }
 
 define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_gep_out_of_range_up:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #256
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    add x8, x0, #256
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #256]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr float, ptr %valp, i32 64
   %val = load float, ptr %valp2
   %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
@@ -604,12 +707,18 @@ define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(ptr %valp) {
 }
 
 define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_down(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_gep_out_of_range_down:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #4
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    sub x8, x0, #4
+; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldur s0, [x0, #-4]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr float, ptr %valp, i32 -1
   %val = load float, ptr %valp2
   %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
@@ -618,11 +727,17 @@ define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_down(ptr %valp) {
 }
 
 define <vscale x 2 x float> @ld1rw_float_unpacked2(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_unpacked2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_unpacked2:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %val = load float, ptr %valp
   %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
   %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
@@ -630,11 +745,17 @@ define <vscale x 2 x float> @ld1rw_float_unpacked2(ptr %valp) {
 }
 
 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_unpacked2_gep:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0, #252]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x0, #252]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #252]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr float, ptr %valp, i32 63
   %val = load float, ptr %valp2
   %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
@@ -643,12 +764,18 @@ define <vscale x 2 x float> @ld1rw_float_unpacked2_gep(ptr %valp) {
 }
 
 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #256
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    add x8, x0, #256
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #256]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr float, ptr %valp, i32 64
   %val = load float, ptr %valp2
   %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
@@ -657,12 +784,18 @@ define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp
 }
 
 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_down(ptr %valp) {
-; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #4
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    sub x8, x0, #4
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldur s0, [x0, #-4]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr float, ptr %valp, i32 -1
   %val = load float, ptr %valp2
   %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
@@ -671,11 +804,17 @@ define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_down(ptr %va
 }
 
 define <vscale x 2 x double> @ld1rd_double(ptr %valp) {
-; CHECK-LABEL: ld1rd_double:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rd_double:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rd_double:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr d0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
+; CHECK-NO-LD1R-NEXT:    ret
   %val = load double, ptr %valp
   %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
   %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
@@ -683,11 +822,17 @@ define <vscale x 2 x double> @ld1rd_double(ptr %valp) {
 }
 
 define <vscale x 2 x double> @ld1rd_double_gep(ptr %valp) {
-; CHECK-LABEL: ld1rd_double_gep:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0, #504]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rd_double_gep:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0, #504]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rd_double_gep:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr d0, [x0, #504]
+; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr double, ptr %valp, i32 63
   %val = load double, ptr %valp2
   %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
@@ -696,12 +841,18 @@ define <vscale x 2 x double> @ld1rd_double_gep(ptr %valp) {
 }
 
 define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(ptr %valp) {
-; CHECK-LABEL: ld1rd_double_gep_out_of_range_up:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #512
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    add x8, x0, #512
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr d0, [x0, #512]
+; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr double, ptr %valp, i32 64
   %val = load double, ptr %valp2
   %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
@@ -710,12 +861,18 @@ define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(ptr %valp) {
 }
 
 define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(ptr %valp) {
-; CHECK-LABEL: ld1rd_double_gep_out_of_range_down:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #8
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    sub x8, x0, #8
+; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldur d0, [x0, #-8]
+; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
+; CHECK-NO-LD1R-NEXT:    ret
   %valp2 = getelementptr double, ptr %valp, i32 -1
   %val = load double, ptr %valp2
   %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
@@ -990,37 +1147,61 @@ define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_zext(<vscale x 2
     ret <vscale x 2 x i64> %res
 }
 define <vscale x 8 x half> @dup_ld1rh_half_passthruundef_nxv8f16(<vscale x 8 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h0
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
     %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half %ld)
     ret <vscale x 8 x half> %res
 }
 define <vscale x 4 x float> @dup_ld1rs_float_passthruundef_nxv4f32(<vscale x 4 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s0
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load float, ptr %addr
     %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, float %ld)
     ret <vscale x 4 x float> %res
 }
 define <vscale x 2 x double> @dup_ld1rd_double_passthruundef_nxv2f64(<vscale x 2 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr d0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.d, p0/m, d0
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load double, ptr %addr
     %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, double %ld)
     ret <vscale x 2 x double> %res
 }
 define <vscale x 4 x half> @dup_ld1rh_half_passthruundef_nxv4f16(<vscale x 4 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h0
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
     %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> %pg, half %ld)
     ret <vscale x 4 x half> %res
@@ -1062,55 +1243,97 @@ define <vscale x 2 x i64> @dup_ld1rd_i64_passthruzero_nxv2i64(<vscale x 2 x i1>
     ret <vscale x 2 x i64> %res
 }
 define <vscale x 8 x half> @dup_ld1rh_half_passthruzero_nxv8f16(<vscale x 8 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
     %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, half %ld)
     ret <vscale x 8 x half> %res
 }
 define <vscale x 4 x float> @dup_ld1rs_float_passthruzero_nxv4f32(<vscale x 4 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s1
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load float, ptr %addr
     %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, float %ld)
     ret <vscale x 4 x float> %res
 }
 define <vscale x 2 x double> @dup_ld1rd_double_passthruzero_nxv2f64(<vscale x 2 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr d1, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load double, ptr %addr
     %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, double %ld)
     ret <vscale x 2 x double> %res
 }
 define <vscale x 4 x half> @dup_ld1rh_half_passthruzero_nxv4f16(<vscale x 4 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
     %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x i1> %pg, half %ld)
     ret <vscale x 4 x half> %res
 }
 define <vscale x 2 x half> @dup_ld1rh_half_passthruzero_nxv2f16(<vscale x 2 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
     %res = call <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x i1> %pg, half %ld)
     ret <vscale x 2 x half> %res
 }
 define <vscale x 2 x float> @dup_ld1rs_float_passthruzero_nxv2f32(<vscale x 2 x i1> %pg, ptr %addr) {
-; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ret
+; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
+; CHECK-LD1R:       // %bb.0:
+; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x0]
+; CHECK-LD1R-NEXT:    ret
+;
+; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
+; CHECK-NO-LD1R:       // %bb.0:
+; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
+; CHECK-NO-LD1R-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s1
+; CHECK-NO-LD1R-NEXT:    ret
     %ld = load float, ptr %addr
     %res = call <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, float %ld)
     ret <vscale x 2 x float> %res
@@ -1313,3 +1536,4 @@ declare <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float>,
 
 
 attributes #0 = { "target-features"="+sve,+bf16" }
+attributes #1 = { "target-cpu"="neoverse-v1" }


        


More information about the llvm-branch-commits mailing list