[llvm] [AArch64][SME] Allow memory operations lowering to custom SME functions. (PR #79263)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 9 05:47:20 PDT 2024


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/79263

>From 94a31ab03d8bb9c7178998824bc848f89f7e16c8 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Wed, 24 Jan 2024 08:14:07 +0000
Subject: [PATCH 01/11] [AArch64][SME] Enable memory operations lowering to
 custom SME functions.

This change allows memcpy, memset, and memmove to be lowered to the custom
SME versions provided by LibRT.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 10 ++-
 .../AArch64/AArch64SelectionDAGInfo.cpp       | 72 +++++++++++++++++++
 .../Target/AArch64/AArch64SelectionDAGInfo.h  |  4 ++
 .../AArch64/Utils/AArch64SMEAttributes.cpp    |  3 +
 llvm/test/CodeGen/AArch64/sme2-mops.ll        | 67 +++++++++++++++++
 5 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-mops.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 819e8ccd5c33f0..fcf5747e3e359a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7941,8 +7941,14 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
   if (CLI.CB)
     CalleeAttrs = SMEAttrs(*CLI.CB);
-  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
-    CalleeAttrs = SMEAttrs(ES->getSymbol());
+  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+    if (StringRef(ES->getSymbol()) == StringRef("__arm_sc_memcpy")) {
+      auto Attrs = AttributeList().addFnAttribute(
+          *DAG.getContext(), "aarch64_pstate_sm_compatible");
+      CalleeAttrs = SMEAttrs(Attrs);
+    } else
+      CalleeAttrs = SMEAttrs(ES->getSymbol());
+  }
 
   auto DescribeCallsite =
       [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 9e43f206efcf78..fff4e2333194e3 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -76,12 +76,74 @@ SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
   }
 }
 
+SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, RTLIB::Libcall LC) const {
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+  const AArch64TargetLowering *TLI = STI.getTargetLowering();
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Args.push_back(Entry);
+
+  enum { SME_MEMCPY = 0, SME_MEMMOVE, SME_MEMSET } SMELibcall;
+  switch (LC) {
+  case RTLIB::MEMCPY:
+    SMELibcall = SME_MEMCPY;
+    Entry.Node = Src;
+    Args.push_back(Entry);
+    break;
+  case RTLIB::MEMMOVE:
+    SMELibcall = SME_MEMMOVE;
+    Entry.Node = Src;
+    Args.push_back(Entry);
+    break;
+  case RTLIB::MEMSET:
+    SMELibcall = SME_MEMSET;
+    if (Src.getValueType().bitsGT(MVT::i32))
+      Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+    else if (Src.getValueType().bitsLT(MVT::i32))
+      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+    Entry.Node = Src;
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Entry.IsSExt = false;
+    Args.push_back(Entry);
+    break;
+  default:
+    return SDValue();
+  }
+  Entry.Node = Size;
+  Args.push_back(Entry);
+  char const *FunctionNames[3] = {"__arm_sc_memcpy", "__arm_sc_memmove",
+                                  "__arm_sc_memset"};
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL)
+      .setChain(Chain)
+      .setLibCallee(
+          TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+          Type::getVoidTy(*DAG.getContext()),
+          DAG.getExternalSymbol(FunctionNames[SMELibcall],
+                                TLI->getPointerTy(DAG.getDataLayout())),
+          std::move(Args))
+      .setDiscardResult();
+  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+  return CallResult.second;
+}
+
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface())
+    return EmitSpecializedLibcall(DAG, DL, Chain, Dst, Src, Size,
+                                  RTLIB::MEMCPY);
   if (STI.hasMOPS())
     return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
@@ -95,6 +157,11 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface())
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                  RTLIB::MEMSET);
+
   if (STI.hasMOPS()) {
     return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
@@ -108,6 +175,11 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface())
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                  RTLIB::MEMMOVE);
   if (STI.hasMOPS()) {
     return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 73f93724d6fc73..9c55c21f3c3202 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -47,6 +47,10 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   MachinePointerInfo DstPtrInfo,
                                   bool ZeroData) const override;
+
+  SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &DL,
+                                 SDValue Chain, SDValue Dst, SDValue Src,
+                                 SDValue Size, RTLIB::Libcall LC) const;
 };
 }
 
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index d399e0ac0794f6..b35e3f7f5394e4 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -53,6 +53,9 @@ SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) {
   if (FuncName == "__arm_tpidr2_restore")
     Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) |
                SMEAttrs::SME_ABI_Routine;
+  if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" ||
+      FuncName == "__arm_sc_memmove")
+    Bitmask |= SMEAttrs::SM_Compatible;
 }
 
 SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
diff --git a/llvm/test/CodeGen/AArch64/sme2-mops.ll b/llvm/test/CodeGen/AArch64/sme2-mops.ll
new file mode 100644
index 00000000000000..0ded6e965ecb9c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-mops.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+@dst = global [512 x i8] zeroinitializer, align 1
+@src = global [512 x i8] zeroinitializer, align 1
+
+
+define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: sc_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sc_memset(i64 noundef %n) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: sc_memset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-NEXT:    bl __arm_sc_memset
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+define void @sc_memmove(i64 noundef %n) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: sc_memmove:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memmove
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)

>From b5b73118c75392adb0fc559e127f257c13b0df37 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 30 Jan 2024 09:08:43 +0000
Subject: [PATCH 02/11] Resolved comments

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 10 ++------
 .../AArch64/AArch64SelectionDAGInfo.cpp       | 24 +++++++------------
 llvm/test/CodeGen/AArch64/sme2-mops.ll        |  1 -
 3 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fcf5747e3e359a..819e8ccd5c33f0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7941,14 +7941,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
   if (CLI.CB)
     CalleeAttrs = SMEAttrs(*CLI.CB);
-  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
-    if (StringRef(ES->getSymbol()) == StringRef("__arm_sc_memcpy")) {
-      auto Attrs = AttributeList().addFnAttribute(
-          *DAG.getContext(), "aarch64_pstate_sm_compatible");
-      CalleeAttrs = SMEAttrs(Attrs);
-    } else
-      CalleeAttrs = SMEAttrs(ES->getSymbol());
-  }
+  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
+    CalleeAttrs = SMEAttrs(ES->getSymbol());
 
   auto DescribeCallsite =
       [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index fff4e2333194e3..1c4142e535793c 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -84,28 +84,26 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   const AArch64TargetLowering *TLI = STI.getTargetLowering();
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
+  SDValue Symbol;
   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
   Entry.Node = Dst;
   Args.push_back(Entry);
+  EVT Ty = TLI->getPointerTy(DAG.getDataLayout());
 
-  enum { SME_MEMCPY = 0, SME_MEMMOVE, SME_MEMSET } SMELibcall;
   switch (LC) {
   case RTLIB::MEMCPY:
-    SMELibcall = SME_MEMCPY;
+    Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", Ty);
     Entry.Node = Src;
     Args.push_back(Entry);
     break;
   case RTLIB::MEMMOVE:
-    SMELibcall = SME_MEMMOVE;
+    Symbol = DAG.getExternalSymbol("__arm_sc_memmove", Ty);
     Entry.Node = Src;
     Args.push_back(Entry);
     break;
   case RTLIB::MEMSET:
-    SMELibcall = SME_MEMSET;
-    if (Src.getValueType().bitsGT(MVT::i32))
-      Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
-    else if (Src.getValueType().bitsLT(MVT::i32))
-      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+    Symbol = DAG.getExternalSymbol("__arm_sc_memset", Ty);
+    Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
     Entry.Node = Src;
     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
     Entry.IsSExt = false;
@@ -116,18 +114,12 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   }
   Entry.Node = Size;
   Args.push_back(Entry);
-  char const *FunctionNames[3] = {"__arm_sc_memcpy", "__arm_sc_memmove",
-                                  "__arm_sc_memset"};
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(DL)
       .setChain(Chain)
-      .setLibCallee(
-          TLI->getLibcallCallingConv(RTLIB::MEMCPY),
-          Type::getVoidTy(*DAG.getContext()),
-          DAG.getExternalSymbol(FunctionNames[SMELibcall],
-                                TLI->getPointerTy(DAG.getDataLayout())),
-          std::move(Args))
+      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+                    Type::getVoidTy(*DAG.getContext()), Symbol, std::move(Args))
       .setDiscardResult();
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
   return CallResult.second;
diff --git a/llvm/test/CodeGen/AArch64/sme2-mops.ll b/llvm/test/CodeGen/AArch64/sme2-mops.ll
index 0ded6e965ecb9c..0599bc61a52f73 100644
--- a/llvm/test/CodeGen/AArch64/sme2-mops.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-mops.ll
@@ -4,7 +4,6 @@
 @dst = global [512 x i8] zeroinitializer, align 1
 @src = global [512 x i8] zeroinitializer, align 1
 
-
 define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: sc_memcpy:
 ; CHECK:       // %bb.0: // %entry

>From 281dfd3a1e536382f873c34f78585a63b3e143a0 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Wed, 31 Jan 2024 09:54:42 +0000
Subject: [PATCH 03/11] Resolved comments.

---
 llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 1c4142e535793c..a3386bea28350c 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -116,11 +116,9 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL)
-      .setChain(Chain)
-      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
-                    Type::getVoidTy(*DAG.getContext()), Symbol, std::move(Args))
-      .setDiscardResult();
+  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+      TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+      Symbol, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
   return CallResult.second;
 }

>From 70f84734f834573aa07529d8676d7f5b2fc338c1 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 5 Feb 2024 08:36:14 +0000
Subject: [PATCH 04/11] Resolved comments.

---
 .../AArch64/AArch64SelectionDAGInfo.cpp       |  26 +-
 .../AArch64/Utils/AArch64SMEAttributes.cpp    |   2 +-
 llvm/test/CodeGen/AArch64/sme2-mops.ll        | 491 +++++++++++++++++-
 3 files changed, 511 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a3386bea28350c..d2908ae83e9c0a 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -15,6 +15,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-selectiondag-info"
 
+static cl::opt<bool>
+    EnableSMEMops("aarch64-enable-sme-mops", cl::Hidden,
+                  cl::desc("Enable AArch64 SME memory operations "
+                           "to lower to librt functions"),
+                  cl::init(true));
+
 SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
                                           SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Chain, SDValue Dst,
@@ -90,6 +96,9 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   Args.push_back(Entry);
   EVT Ty = TLI->getPointerTy(DAG.getDataLayout());
 
+  if (!EnableSMEMops)
+    return SDValue();
+
   switch (LC) {
   case RTLIB::MEMCPY:
     Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", Ty);
@@ -116,9 +125,11 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
-      TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
-      Symbol, std::move(Args));
+  CLI.setDebugLoc(DL)
+      .setChain(Chain)
+      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+                    Type::getVoidTy(*DAG.getContext()), Symbol, std::move(Args))
+      .setDiscardResult();
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
   return CallResult.second;
 }
@@ -131,7 +142,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface())
+  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface() ||
+      Attrs.hasStreamingInterface())
     return EmitSpecializedLibcall(DAG, DL, Chain, Dst, Src, Size,
                                   RTLIB::MEMCPY);
   if (STI.hasMOPS())
@@ -148,7 +160,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface())
+  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface() ||
+      Attrs.hasStreamingInterface())
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   RTLIB::MEMSET);
 
@@ -167,7 +180,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface())
+  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface() ||
+      Attrs.hasStreamingInterface())
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   RTLIB::MEMMOVE);
   if (STI.hasMOPS()) {
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index b35e3f7f5394e4..015ca4cb92b25e 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -54,7 +54,7 @@ SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) {
     Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) |
                SMEAttrs::SME_ABI_Routine;
   if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" ||
-      FuncName == "__arm_sc_memmove")
+      FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr")
     Bitmask |= SMEAttrs::SM_Compatible;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sme2-mops.ll b/llvm/test/CodeGen/AArch64/sme2-mops.ll
index 0599bc61a52f73..6c3017d076079b 100644
--- a/llvm/test/CodeGen/AArch64/sme2-mops.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-mops.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-enable-sme-mops=false < %s | FileCheck %s -check-prefixes=NO_SME_MOPS
 
 @dst = global [512 x i8] zeroinitializer, align 1
 @src = global [512 x i8] zeroinitializer, align 1
@@ -18,6 +19,48 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:    bl __arm_sc_memcpy
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sc_memcpy:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w19, -8
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    bl __arm_sme_state
+; NO_SME_MOPS-NEXT:    adrp x8, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
+; NO_SME_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB0_2
+; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:  .LBB0_2: // %entry
+; NO_SME_MOPS-NEXT:    mov x0, x8
+; NO_SME_MOPS-NEXT:    bl memcpy
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB0_4
+; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:  .LBB0_4: // %entry
+; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
   ret void
@@ -37,6 +80,46 @@ define void @sc_memset(i64 noundef %n) "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:    bl __arm_sc_memset
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sc_memset:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w19, -8
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    bl __arm_sme_state
+; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB1_2
+; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:  .LBB1_2: // %entry
+; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
+; NO_SME_MOPS-NEXT:    bl memset
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB1_4
+; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:  .LBB1_4: // %entry
+; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
   ret void
@@ -56,6 +139,412 @@ define void @sc_memmove(i64 noundef %n) "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:    bl __arm_sc_memmove
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sc_memmove:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w19, -8
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    bl __arm_sme_state
+; NO_SME_MOPS-NEXT:    adrp x8, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
+; NO_SME_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB2_2
+; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:  .LBB2_2: // %entry
+; NO_SME_MOPS-NEXT:    mov x0, x8
+; NO_SME_MOPS-NEXT:    bl memmove
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB2_4
+; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:  .LBB2_4: // %entry
+; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: se_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: se_memcpy:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memcpy
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: se_memset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-NEXT:    bl __arm_sc_memset
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: se_memset:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
+; NO_SME_MOPS-NEXT:    bl memset
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: se_memmove:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memmove
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: se_memmove:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memmove
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" {
+; CHECK-LABEL: sb_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset b8, -24
+; CHECK-NEXT:    .cfi_offset b9, -32
+; CHECK-NEXT:    .cfi_offset b10, -40
+; CHECK-NEXT:    .cfi_offset b11, -48
+; CHECK-NEXT:    .cfi_offset b12, -56
+; CHECK-NEXT:    .cfi_offset b13, -64
+; CHECK-NEXT:    .cfi_offset b14, -72
+; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sb_memcpy:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memcpy
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sb_memset(i64 noundef %n) "aarch64_pstate_sm_body" {
+; CHECK-LABEL: sb_memset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset b8, -24
+; CHECK-NEXT:    .cfi_offset b9, -32
+; CHECK-NEXT:    .cfi_offset b10, -40
+; CHECK-NEXT:    .cfi_offset b11, -48
+; CHECK-NEXT:    .cfi_offset b12, -56
+; CHECK-NEXT:    .cfi_offset b13, -64
+; CHECK-NEXT:    .cfi_offset b14, -72
+; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    bl __arm_sc_memset
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sb_memset:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
+; NO_SME_MOPS-NEXT:    bl memset
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+define void @sb_memmove(i64 noundef %n) "aarch64_pstate_sm_body" {
+; CHECK-LABEL: sb_memmove:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset b8, -24
+; CHECK-NEXT:    .cfi_offset b9, -32
+; CHECK-NEXT:    .cfi_offset b10, -40
+; CHECK-NEXT:    .cfi_offset b11, -48
+; CHECK-NEXT:    .cfi_offset b12, -56
+; CHECK-NEXT:    .cfi_offset b13, -64
+; CHECK-NEXT:    .cfi_offset b14, -72
+; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memmove
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sb_memmove:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
+; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
+; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
+; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
+; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
+; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
+; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
+; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
+; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
+; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memmove
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
   ret void

>From 5b5ffd5e24fdcd7d2cda548137373344b3fc3ad9 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 5 Feb 2024 16:41:33 +0000
Subject: [PATCH 05/11] Removed accidentally restored setDiscardResult() for
 lowering call, fix issue with incorrect type initialization for Size
 argument.

---
 llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 10 ++++------
 llvm/test/CodeGen/AArch64/sme2-mops.ll              |  3 ---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index d2908ae83e9c0a..a04d83f69b5cf4 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -115,21 +115,19 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
     Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
     Entry.Node = Src;
     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
-    Entry.IsSExt = false;
     Args.push_back(Entry);
     break;
   default:
     return SDValue();
   }
   Entry.Node = Size;
+  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL)
-      .setChain(Chain)
-      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
-                    Type::getVoidTy(*DAG.getContext()), Symbol, std::move(Args))
-      .setDiscardResult();
+  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+      TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+      Type::getVoidTy(*DAG.getContext()), Symbol, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
   return CallResult.second;
 }
diff --git a/llvm/test/CodeGen/AArch64/sme2-mops.ll b/llvm/test/CodeGen/AArch64/sme2-mops.ll
index 6c3017d076079b..dda509993f4807 100644
--- a/llvm/test/CodeGen/AArch64/sme2-mops.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-mops.ll
@@ -76,7 +76,6 @@ define void @sc_memset(i64 noundef %n) "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:    adrp x0, :got:dst
 ; CHECK-NEXT:    mov w1, #2 // =0x2
 ; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    // kill: def $w2 killed $w2 killed $x2
 ; CHECK-NEXT:    bl __arm_sc_memset
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -247,7 +246,6 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    adrp x0, :got:dst
 ; CHECK-NEXT:    mov w1, #2 // =0x2
 ; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    // kill: def $w2 killed $w2 killed $x2
 ; CHECK-NEXT:    bl __arm_sc_memset
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -431,7 +429,6 @@ define void @sb_memset(i64 noundef %n) "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    adrp x0, :got:dst
 ; CHECK-NEXT:    mov w1, #2 // =0x2
-; CHECK-NEXT:    // kill: def $w2 killed $w2 killed $x2
 ; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NEXT:    bl __arm_sc_memset
 ; CHECK-NEXT:    smstop sm

>From 00beaf114082d6dc915801df9a62207a6318e334 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 5 Feb 2024 21:56:11 +0000
Subject: [PATCH 06/11] Restore change accidentally removed before.

---
 llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a04d83f69b5cf4..a3658b376f5f83 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -126,8 +126,8 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
-      TLI->getLibcallCallingConv(RTLIB::MEMCPY),
-      Type::getVoidTy(*DAG.getContext()), Symbol, std::move(Args));
+      TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+      Symbol, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
   return CallResult.second;
 }

>From 751c95f0a50bf0e621ba5fe9e51940a83f0a910f Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Thu, 29 Feb 2024 16:53:47 +0000
Subject: [PATCH 07/11] Resolved comments.

---
 .../AArch64/AArch64SelectionDAGInfo.cpp       |  73 +--
 llvm/test/CodeGen/AArch64/sme2-mops.ll        | 552 ------------------
 .../streaming-compatible-memory-ops.ll        | 289 +++++++++
 3 files changed, 329 insertions(+), 585 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/sme2-mops.ll
 create mode 100644 llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a3658b376f5f83..a8cefc3a72ed5e 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -16,10 +16,10 @@ using namespace llvm;
 #define DEBUG_TYPE "aarch64-selectiondag-info"
 
 static cl::opt<bool>
-    EnableSMEMops("aarch64-enable-sme-mops", cl::Hidden,
-                  cl::desc("Enable AArch64 SME memory operations "
-                           "to lower to librt functions"),
-                  cl::init(true));
+    LowerToSMERoutines("aarch64-lower-to-sme-routines", cl::Hidden,
+                       cl::desc("Enable AArch64 SME memory operations "
+                                "to lower to librt functions"),
+                       cl::init(true));
 
 SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
                                           SelectionDAG &DAG, const SDLoc &DL,
@@ -89,40 +89,50 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
   const AArch64TargetLowering *TLI = STI.getTargetLowering();
   TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
+  TargetLowering::ArgListEntry DstEntry;
   SDValue Symbol;
-  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
-  Entry.Node = Dst;
-  Args.push_back(Entry);
+  DstEntry.Ty = PointerType::getUnqual(*DAG.getContext());
+  DstEntry.Node = Dst;
+  Args.push_back(DstEntry);
   EVT Ty = TLI->getPointerTy(DAG.getDataLayout());
 
-  if (!EnableSMEMops)
+  if (!LowerToSMERoutines)
     return SDValue();
 
   switch (LC) {
-  case RTLIB::MEMCPY:
+  case RTLIB::MEMCPY: {
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = PointerType::getUnqual(*DAG.getContext());
     Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", Ty);
     Entry.Node = Src;
     Args.push_back(Entry);
     break;
-  case RTLIB::MEMMOVE:
+  }
+  case RTLIB::MEMMOVE: {
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = PointerType::getUnqual(*DAG.getContext());
     Symbol = DAG.getExternalSymbol("__arm_sc_memmove", Ty);
     Entry.Node = Src;
     Args.push_back(Entry);
     break;
-  case RTLIB::MEMSET:
+  }
+  case RTLIB::MEMSET: {
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = PointerType::getUnqual(*DAG.getContext());
     Symbol = DAG.getExternalSymbol("__arm_sc_memset", Ty);
     Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
     Entry.Node = Src;
     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
     Args.push_back(Entry);
     break;
+  }
   default:
     return SDValue();
   }
-  Entry.Node = Size;
-  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
-  Args.push_back(Entry);
+  TargetLowering::ArgListEntry SizeEntry;
+  SizeEntry.Node = Size;
+  SizeEntry.Ty = PointerType::getUnqual(*DAG.getContext());
+  Args.push_back(SizeEntry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
@@ -139,14 +149,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
-  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface() ||
-      Attrs.hasStreamingInterface())
-    return EmitSpecializedLibcall(DAG, DL, Chain, Dst, Src, Size,
-                                  RTLIB::MEMCPY);
   if (STI.hasMOPS())
     return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (!Attrs.hasNonStreamingInterfaceAndBody())
+    return EmitSpecializedLibcall(DAG, DL, Chain, Dst, Src, Size,
+                                  RTLIB::MEMCPY);
   return SDValue();
 }
 
@@ -157,16 +167,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
+  if (STI.hasMOPS())
+    return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
+                    Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
+
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface() ||
-      Attrs.hasStreamingInterface())
+  if (!Attrs.hasNonStreamingInterfaceAndBody())
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   RTLIB::MEMSET);
-
-  if (STI.hasMOPS()) {
-    return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
-                    Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
-  }
   return SDValue();
 }
 
@@ -177,15 +185,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
+  if (STI.hasMOPS())
+    return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size,
+                    Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (Attrs.hasStreamingBody() || Attrs.hasStreamingCompatibleInterface() ||
-      Attrs.hasStreamingInterface())
+  if (!Attrs.hasNonStreamingInterfaceAndBody())
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   RTLIB::MEMMOVE);
-  if (STI.hasMOPS()) {
-    return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size,
-                    Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
-  }
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sme2-mops.ll b/llvm/test/CodeGen/AArch64/sme2-mops.ll
deleted file mode 100644
index dda509993f4807..00000000000000
--- a/llvm/test/CodeGen/AArch64/sme2-mops.ll
+++ /dev/null
@@ -1,552 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-enable-sme-mops=false < %s | FileCheck %s -check-prefixes=NO_SME_MOPS
-
-@dst = global [512 x i8] zeroinitializer, align 1
-@src = global [512 x i8] zeroinitializer, align 1
-
-define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: sc_memcpy:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    adrp x1, :got:src
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NEXT:    bl __arm_sc_memcpy
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: sc_memcpy:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w19, -8
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    bl __arm_sme_state
-; NO_SME_MOPS-NEXT:    adrp x8, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
-; NO_SME_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB0_2
-; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:  .LBB0_2: // %entry
-; NO_SME_MOPS-NEXT:    mov x0, x8
-; NO_SME_MOPS-NEXT:    bl memcpy
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB0_4
-; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:  .LBB0_4: // %entry
-; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
-  ret void
-}
-
-define void @sc_memset(i64 noundef %n) "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: sc_memset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    mov w1, #2 // =0x2
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    bl __arm_sc_memset
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: sc_memset:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w19, -8
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    bl __arm_sme_state
-; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB1_2
-; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:  .LBB1_2: // %entry
-; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
-; NO_SME_MOPS-NEXT:    bl memset
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB1_4
-; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:  .LBB1_4: // %entry
-; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
-  ret void
-}
-
-define void @sc_memmove(i64 noundef %n) "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: sc_memmove:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    adrp x1, :got:src
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NEXT:    bl __arm_sc_memmove
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: sc_memmove:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w19, -8
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    bl __arm_sme_state
-; NO_SME_MOPS-NEXT:    adrp x8, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
-; NO_SME_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB2_2
-; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:  .LBB2_2: // %entry
-; NO_SME_MOPS-NEXT:    mov x0, x8
-; NO_SME_MOPS-NEXT:    bl memmove
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB2_4
-; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:  .LBB2_4: // %entry
-; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
-  ret void
-}
-
-define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: se_memcpy:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    adrp x1, :got:src
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NEXT:    bl __arm_sc_memcpy
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: se_memcpy:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memcpy
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
-  ret void
-}
-
-define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: se_memset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    mov w1, #2 // =0x2
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    bl __arm_sc_memset
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: se_memset:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
-; NO_SME_MOPS-NEXT:    bl memset
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
-  ret void
-}
-
-define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: se_memmove:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    adrp x1, :got:src
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NEXT:    bl __arm_sc_memmove
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: se_memmove:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memmove
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
-  ret void
-}
-
-define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" {
-; CHECK-LABEL: sb_memcpy:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    .cfi_offset b10, -40
-; CHECK-NEXT:    .cfi_offset b11, -48
-; CHECK-NEXT:    .cfi_offset b12, -56
-; CHECK-NEXT:    .cfi_offset b13, -64
-; CHECK-NEXT:    .cfi_offset b14, -72
-; CHECK-NEXT:    .cfi_offset b15, -80
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    adrp x1, :got:src
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NEXT:    bl __arm_sc_memcpy
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: sb_memcpy:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memcpy
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
-  ret void
-}
-
-define void @sb_memset(i64 noundef %n) "aarch64_pstate_sm_body" {
-; CHECK-LABEL: sb_memset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    .cfi_offset b10, -40
-; CHECK-NEXT:    .cfi_offset b11, -48
-; CHECK-NEXT:    .cfi_offset b12, -56
-; CHECK-NEXT:    .cfi_offset b13, -64
-; CHECK-NEXT:    .cfi_offset b14, -72
-; CHECK-NEXT:    .cfi_offset b15, -80
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    mov w1, #2 // =0x2
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    bl __arm_sc_memset
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: sb_memset:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
-; NO_SME_MOPS-NEXT:    bl memset
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
-  ret void
-}
-
-define void @sb_memmove(i64 noundef %n) "aarch64_pstate_sm_body" {
-; CHECK-LABEL: sb_memmove:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    .cfi_offset b10, -40
-; CHECK-NEXT:    .cfi_offset b11, -48
-; CHECK-NEXT:    .cfi_offset b12, -56
-; CHECK-NEXT:    .cfi_offset b13, -64
-; CHECK-NEXT:    .cfi_offset b14, -72
-; CHECK-NEXT:    .cfi_offset b15, -80
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    adrp x0, :got:dst
-; CHECK-NEXT:    adrp x1, :got:src
-; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NEXT:    bl __arm_sc_memmove
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; NO_SME_MOPS-LABEL: sb_memmove:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    .cfi_def_cfa_offset 80
-; NO_SME_MOPS-NEXT:    .cfi_offset w30, -16
-; NO_SME_MOPS-NEXT:    .cfi_offset b8, -24
-; NO_SME_MOPS-NEXT:    .cfi_offset b9, -32
-; NO_SME_MOPS-NEXT:    .cfi_offset b10, -40
-; NO_SME_MOPS-NEXT:    .cfi_offset b11, -48
-; NO_SME_MOPS-NEXT:    .cfi_offset b12, -56
-; NO_SME_MOPS-NEXT:    .cfi_offset b13, -64
-; NO_SME_MOPS-NEXT:    .cfi_offset b14, -72
-; NO_SME_MOPS-NEXT:    .cfi_offset b15, -80
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memmove
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
-entry:
-  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
-  ret void
-}
-
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
-declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
new file mode 100644
index 00000000000000..f2258bafd6134d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=NO_SME_MOPS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK_MOPS
+
+@dst = global [512 x i8] zeroinitializer, align 1
+@src = global [512 x i8] zeroinitializer, align 1
+
+define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
+; CHECK-LABEL: se_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: se_memcpy:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memcpy
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+;
+; CHECK_MOPS-LABEL: se_memcpy:
+; CHECK_MOPS:       // %bb.0: // %entry
+; CHECK_MOPS-NEXT:    adrp x8, :got:src
+; CHECK_MOPS-NEXT:    adrp x9, :got:dst
+; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK_MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
+; CHECK-LABEL: se_memset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    bl __arm_sc_memset
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: se_memset:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
+; NO_SME_MOPS-NEXT:    bl memset
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+;
+; CHECK_MOPS-LABEL: se_memset:
+; CHECK_MOPS:       // %bb.0: // %entry
+; CHECK_MOPS-NEXT:    adrp x8, :got:dst
+; CHECK_MOPS-NEXT:    mov w9, #2 // =0x2
+; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK_MOPS-NEXT:    setp [x8]!, x0!, x9
+; CHECK_MOPS-NEXT:    setm [x8]!, x0!, x9
+; CHECK_MOPS-NEXT:    sete [x8]!, x0!, x9
+; CHECK_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
+; CHECK-LABEL: se_memmove:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memmove
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: se_memmove:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memmove
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+;
+; CHECK_MOPS-LABEL: se_memmove:
+; CHECK_MOPS:       // %bb.0: // %entry
+; CHECK_MOPS-NEXT:    adrp x8, :got:src
+; CHECK_MOPS-NEXT:    adrp x9, :got:dst
+; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK_MOPS-NEXT:    cpyp [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpym [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpye [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: sc_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sc_memcpy:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    bl __arm_sme_state
+; NO_SME_MOPS-NEXT:    adrp x8, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
+; NO_SME_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB3_2
+; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:  .LBB3_2: // %entry
+; NO_SME_MOPS-NEXT:    mov x0, x8
+; NO_SME_MOPS-NEXT:    bl memcpy
+; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB3_4
+; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:  .LBB3_4: // %entry
+; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+;
+; CHECK_MOPS-LABEL: sc_memcpy:
+; CHECK_MOPS:       // %bb.0: // %entry
+; CHECK_MOPS-NEXT:    adrp x8, :got:src
+; CHECK_MOPS-NEXT:    adrp x9, :got:dst
+; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK_MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: sb_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; NO_SME_MOPS-LABEL: sb_memcpy:
+; NO_SME_MOPS:       // %bb.0: // %entry
+; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    mov x2, x0
+; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
+; NO_SME_MOPS-NEXT:    adrp x1, :got:src
+; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    bl memcpy
+; NO_SME_MOPS-NEXT:    smstart sm
+; NO_SME_MOPS-NEXT:    smstop sm
+; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; NO_SME_MOPS-NEXT:    ret
+;
+; CHECK_MOPS-LABEL: sb_memcpy:
+; CHECK_MOPS:       // %bb.0: // %entry
+; CHECK_MOPS-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK_MOPS-NEXT:    smstart sm
+; CHECK_MOPS-NEXT:    adrp x8, :got:src
+; CHECK_MOPS-NEXT:    adrp x9, :got:dst
+; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK_MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK_MOPS-NEXT:    smstop sm
+; CHECK_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK_MOPS-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK_MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)

>From 9c2b171f189fc0dae5f868b153db08b2f08672f5 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 4 Mar 2024 02:30:09 +0000
Subject: [PATCH 08/11] Resolved comments.

---
 .../AArch64/AArch64SelectionDAGInfo.cpp       |  12 +-
 .../streaming-compatible-memory-ops.ll        | 352 +++++++++---------
 2 files changed, 184 insertions(+), 180 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a8cefc3a72ed5e..7cf3b8f3a2be4a 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -95,6 +95,7 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   DstEntry.Node = Dst;
   Args.push_back(DstEntry);
   EVT Ty = TLI->getPointerTy(DAG.getDataLayout());
+  PointerType *RetTy;
 
   if (!LowerToSMERoutines)
     return SDValue();
@@ -106,6 +107,7 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
     Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", Ty);
     Entry.Node = Src;
     Args.push_back(Entry);
+    RetTy = PointerType::getUnqual(*DAG.getContext());
     break;
   }
   case RTLIB::MEMMOVE: {
@@ -114,16 +116,17 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
     Symbol = DAG.getExternalSymbol("__arm_sc_memmove", Ty);
     Entry.Node = Src;
     Args.push_back(Entry);
+    RetTy = PointerType::getUnqual(*DAG.getContext());
     break;
   }
   case RTLIB::MEMSET: {
     TargetLowering::ArgListEntry Entry;
-    Entry.Ty = PointerType::getUnqual(*DAG.getContext());
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
     Symbol = DAG.getExternalSymbol("__arm_sc_memset", Ty);
     Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
     Entry.Node = Src;
-    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
     Args.push_back(Entry);
+    RetTy = PointerType::getUnqual(*DAG.getContext());
     break;
   }
   default:
@@ -133,11 +136,12 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   SizeEntry.Node = Size;
   SizeEntry.Ty = PointerType::getUnqual(*DAG.getContext());
   Args.push_back(SizeEntry);
+  assert(Symbol->getOpcode() == ISD::ExternalSymbol &&
+         "Function name is not set");
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
-      TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
-      Symbol, std::move(Args));
+      TLI->getLibcallCallingConv(LC), RetTy, Symbol, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
   return CallResult.second;
 }
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index f2258bafd6134d..c39894c27d9d4d 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=NO_SME_MOPS
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK_MOPS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
 
 @dst = global [512 x i8] zeroinitializer, align 1
 @src = global [512 x i8] zeroinitializer, align 1
@@ -19,38 +19,38 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; NO_SME_MOPS-LABEL: se_memcpy:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memcpy
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
+; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
-; CHECK_MOPS-LABEL: se_memcpy:
-; CHECK_MOPS:       // %bb.0: // %entry
-; CHECK_MOPS-NEXT:    adrp x8, :got:src
-; CHECK_MOPS-NEXT:    adrp x9, :got:dst
-; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
-; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
-; CHECK_MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    ret
+; CHECK-MOPS-LABEL: se_memcpy:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
   ret void
@@ -68,36 +68,36 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; NO_SME_MOPS-LABEL: se_memset:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    mov w1, #2 // =0x2
-; NO_SME_MOPS-NEXT:    bl memset
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
+; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    mov w1, #2 // =0x2
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memset
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
-; CHECK_MOPS-LABEL: se_memset:
-; CHECK_MOPS:       // %bb.0: // %entry
-; CHECK_MOPS-NEXT:    adrp x8, :got:dst
-; CHECK_MOPS-NEXT:    mov w9, #2 // =0x2
-; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
-; CHECK_MOPS-NEXT:    setp [x8]!, x0!, x9
-; CHECK_MOPS-NEXT:    setm [x8]!, x0!, x9
-; CHECK_MOPS-NEXT:    sete [x8]!, x0!, x9
-; CHECK_MOPS-NEXT:    ret
+; CHECK-MOPS-LABEL: se_memset:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:dst
+; CHECK-MOPS-NEXT:    mov w9, #2 // =0x2
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    setp [x8]!, x0!, x9
+; CHECK-MOPS-NEXT:    setm [x8]!, x0!, x9
+; CHECK-MOPS-NEXT:    sete [x8]!, x0!, x9
+; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
   ret void
@@ -116,38 +116,38 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; NO_SME_MOPS-LABEL: se_memmove:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memmove
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
+; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memmove
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
-; CHECK_MOPS-LABEL: se_memmove:
-; CHECK_MOPS:       // %bb.0: // %entry
-; CHECK_MOPS-NEXT:    adrp x8, :got:src
-; CHECK_MOPS-NEXT:    adrp x9, :got:dst
-; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
-; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
-; CHECK_MOPS-NEXT:    cpyp [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpym [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpye [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    ret
+; CHECK-MOPS-LABEL: se_memmove:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpym [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpye [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
   ret void
@@ -166,47 +166,47 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; NO_SME_MOPS-LABEL: sc_memcpy:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    bl __arm_sme_state
-; NO_SME_MOPS-NEXT:    adrp x8, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    and x19, x0, #0x1
-; NO_SME_MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB3_2
-; NO_SME_MOPS-NEXT:  // %bb.1: // %entry
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:  .LBB3_2: // %entry
-; NO_SME_MOPS-NEXT:    mov x0, x8
-; NO_SME_MOPS-NEXT:    bl memcpy
-; NO_SME_MOPS-NEXT:    tbz w19, #0, .LBB3_4
-; NO_SME_MOPS-NEXT:  // %bb.3: // %entry
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:  .LBB3_4: // %entry
-; NO_SME_MOPS-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
+; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    bl __arm_sme_state
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x8, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_2
+; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_2: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x0, x8
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_4
+; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
-; CHECK_MOPS-LABEL: sc_memcpy:
-; CHECK_MOPS:       // %bb.0: // %entry
-; CHECK_MOPS-NEXT:    adrp x8, :got:src
-; CHECK_MOPS-NEXT:    adrp x9, :got:dst
-; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
-; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
-; CHECK_MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    ret
+; CHECK-MOPS-LABEL: sc_memcpy:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
   ret void
@@ -235,50 +235,50 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; NO_SME_MOPS-LABEL: sb_memcpy:
-; NO_SME_MOPS:       // %bb.0: // %entry
-; NO_SME_MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    mov x2, x0
-; NO_SME_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; NO_SME_MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    adrp x0, :got:dst
-; NO_SME_MOPS-NEXT:    adrp x1, :got:src
-; NO_SME_MOPS-NEXT:    ldr x0, [x0, :got_lo12:dst]
-; NO_SME_MOPS-NEXT:    ldr x1, [x1, :got_lo12:src]
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    bl memcpy
-; NO_SME_MOPS-NEXT:    smstart sm
-; NO_SME_MOPS-NEXT:    smstop sm
-; NO_SME_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; NO_SME_MOPS-NEXT:    ret
+; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
-; CHECK_MOPS-LABEL: sb_memcpy:
-; CHECK_MOPS:       // %bb.0: // %entry
-; CHECK_MOPS-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK_MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK_MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK_MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK_MOPS-NEXT:    smstart sm
-; CHECK_MOPS-NEXT:    adrp x8, :got:src
-; CHECK_MOPS-NEXT:    adrp x9, :got:dst
-; CHECK_MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
-; CHECK_MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
-; CHECK_MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
-; CHECK_MOPS-NEXT:    smstop sm
-; CHECK_MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK_MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK_MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK_MOPS-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
-; CHECK_MOPS-NEXT:    ret
+; CHECK-MOPS-LABEL: sb_memcpy:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    smstart sm
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    smstop sm
+; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
   ret void

>From 92b4448c4d8e62ad1f11a917fd9039a3d9f2184e Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 8 Mar 2024 19:45:49 +0000
Subject: [PATCH 09/11] Select the returned call result based on the libcall's return type.

---
 llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 7cf3b8f3a2be4a..2495222dbf60aa 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -143,7 +143,7 @@ SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
       TLI->getLibcallCallingConv(LC), RetTy, Symbol, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
-  return CallResult.second;
+  return (isa<PointerType>(RetTy) ? CallResult.second : CallResult.first);
 }
 
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(

>From a19fa2439d47abce928b8823424892d727cbe8c8 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 25 Mar 2024 09:51:25 +0000
Subject: [PATCH 10/11] Address review comments: rename EmitSpecializedLibcall and fix the size argument type.

---
 .../AArch64/AArch64SelectionDAGInfo.cpp       | 42 +++++++++----------
 .../Target/AArch64/AArch64SelectionDAGInfo.h  |  7 ++--
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 2495222dbf60aa..3c84680adb38b7 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -82,68 +82,64 @@ SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
   }
 }
 
-SDValue AArch64SelectionDAGInfo::EmitSpecializedLibcall(
+SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, RTLIB::Libcall LC) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
   const AArch64TargetLowering *TLI = STI.getTargetLowering();
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry DstEntry;
   SDValue Symbol;
+  TargetLowering::ArgListEntry DstEntry;
   DstEntry.Ty = PointerType::getUnqual(*DAG.getContext());
   DstEntry.Node = Dst;
+  TargetLowering::ArgListTy Args;
   Args.push_back(DstEntry);
-  EVT Ty = TLI->getPointerTy(DAG.getDataLayout());
-  PointerType *RetTy;
-
-  if (!LowerToSMERoutines)
-    return SDValue();
+  EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout());
 
   switch (LC) {
   case RTLIB::MEMCPY: {
     TargetLowering::ArgListEntry Entry;
     Entry.Ty = PointerType::getUnqual(*DAG.getContext());
-    Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", Ty);
+    Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", PointerVT);
     Entry.Node = Src;
     Args.push_back(Entry);
-    RetTy = PointerType::getUnqual(*DAG.getContext());
     break;
   }
   case RTLIB::MEMMOVE: {
     TargetLowering::ArgListEntry Entry;
     Entry.Ty = PointerType::getUnqual(*DAG.getContext());
-    Symbol = DAG.getExternalSymbol("__arm_sc_memmove", Ty);
+    Symbol = DAG.getExternalSymbol("__arm_sc_memmove", PointerVT);
     Entry.Node = Src;
     Args.push_back(Entry);
-    RetTy = PointerType::getUnqual(*DAG.getContext());
     break;
   }
   case RTLIB::MEMSET: {
     TargetLowering::ArgListEntry Entry;
     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
-    Symbol = DAG.getExternalSymbol("__arm_sc_memset", Ty);
+    Symbol = DAG.getExternalSymbol("__arm_sc_memset", PointerVT);
     Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
     Entry.Node = Src;
     Args.push_back(Entry);
-    RetTy = PointerType::getUnqual(*DAG.getContext());
     break;
   }
   default:
     return SDValue();
   }
+
   TargetLowering::ArgListEntry SizeEntry;
   SizeEntry.Node = Size;
-  SizeEntry.Ty = PointerType::getUnqual(*DAG.getContext());
+  SizeEntry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
   Args.push_back(SizeEntry);
   assert(Symbol->getOpcode() == ISD::ExternalSymbol &&
          "Function name is not set");
+  if (!LowerToSMERoutines)
+    return SDValue();
 
   TargetLowering::CallLoweringInfo CLI(DAG);
+  PointerType *RetTy = PointerType::getUnqual(*DAG.getContext());
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
       TLI->getLibcallCallingConv(LC), RetTy, Symbol, std::move(Args));
-  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
-  return (isa<PointerType>(RetTy) ? CallResult.second : CallResult.first);
+  return TLI->LowerCallTo(CLI).second;
 }
 
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -159,8 +155,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
   if (!Attrs.hasNonStreamingInterfaceAndBody())
-    return EmitSpecializedLibcall(DAG, DL, Chain, Dst, Src, Size,
-                                  RTLIB::MEMCPY);
+    return EmitStreamingCompatibleMemLibCall(DAG, DL, Chain, Dst, Src, Size,
+                                             RTLIB::MEMCPY);
   return SDValue();
 }
 
@@ -177,8 +173,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
   if (!Attrs.hasNonStreamingInterfaceAndBody())
-    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
-                                  RTLIB::MEMSET);
+    return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
+                                             RTLIB::MEMSET);
   return SDValue();
 }
 
@@ -195,8 +191,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
   if (!Attrs.hasNonStreamingInterfaceAndBody())
-    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
-                                  RTLIB::MEMMOVE);
+    return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
+                                             RTLIB::MEMMOVE);
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 9c55c21f3c3202..514de44778630e 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -48,9 +48,10 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
                                   MachinePointerInfo DstPtrInfo,
                                   bool ZeroData) const override;
 
-  SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &DL,
-                                 SDValue Chain, SDValue Dst, SDValue Src,
-                                 SDValue Size, RTLIB::Libcall LC) const;
+  SDValue EmitStreamingCompatibleMemLibCall(SelectionDAG &DAG, const SDLoc &DL,
+                                            SDValue Chain, SDValue Dst,
+                                            SDValue Src, SDValue Size,
+                                            RTLIB::Libcall LC) const;
 };
 }
 

>From e9fa3a68167725449b9b85bd0419a6caf2e6d92b Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 2 Apr 2024 10:44:05 +0000
Subject: [PATCH 11/11] Move the LowerToSMERoutines check from the helper into its callers.

---
 llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 3c84680adb38b7..19ef6f4fb32e74 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -132,8 +132,6 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
   Args.push_back(SizeEntry);
   assert(Symbol->getOpcode() == ISD::ExternalSymbol &&
          "Function name is not set");
-  if (!LowerToSMERoutines)
-    return SDValue();
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   PointerType *RetTy = PointerType::getUnqual(*DAG.getContext());
@@ -154,7 +152,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (!Attrs.hasNonStreamingInterfaceAndBody())
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
     return EmitStreamingCompatibleMemLibCall(DAG, DL, Chain, Dst, Src, Size,
                                              RTLIB::MEMCPY);
   return SDValue();
@@ -172,7 +170,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
                     Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (!Attrs.hasNonStreamingInterfaceAndBody())
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
     return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
                                              RTLIB::MEMSET);
   return SDValue();
@@ -190,7 +188,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
 
   SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
-  if (!Attrs.hasNonStreamingInterfaceAndBody())
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
     return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
                                              RTLIB::MEMMOVE);
   return SDValue();



More information about the llvm-commits mailing list