[llvm] 528943f - [AArch64][SME] Allow memory operations lowering to custom SME functions. (#79263)

via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 9 09:27:49 PDT 2024


Author: Dinar Temirbulatov
Date: 2024-04-09T17:27:46+01:00
New Revision: 528943f1535b925ce175afb2438cec79513cfc2b

URL: https://github.com/llvm/llvm-project/commit/528943f1535b925ce175afb2438cec79513cfc2b
DIFF: https://github.com/llvm/llvm-project/commit/528943f1535b925ce175afb2438cec79513cfc2b.diff

LOG: [AArch64][SME] Allow memory operations lowering to custom SME functions. (#79263)

This change allows lowering memcpy, memset, and memmove to custom SME
versions provided by LibRT.

Added: 
    llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
    llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
    llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 9e43f206efcf78..19ef6f4fb32e74 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -15,6 +15,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-selectiondag-info"
 
+static cl::opt<bool>
+    LowerToSMERoutines("aarch64-lower-to-sme-routines", cl::Hidden,
+                       cl::desc("Enable AArch64 SME memory operations "
+                                "to lower to librt functions"),
+                       cl::init(true));
+
 SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
                                           SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Chain, SDValue Dst,
@@ -76,15 +82,79 @@ SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
   }
 }
 
+SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, RTLIB::Libcall LC) const {
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+  const AArch64TargetLowering *TLI = STI.getTargetLowering();
+  SDValue Symbol;
+  TargetLowering::ArgListEntry DstEntry;
+  DstEntry.Ty = PointerType::getUnqual(*DAG.getContext());
+  DstEntry.Node = Dst;
+  TargetLowering::ArgListTy Args;
+  Args.push_back(DstEntry);
+  EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout());
+
+  switch (LC) {
+  case RTLIB::MEMCPY: {
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = PointerType::getUnqual(*DAG.getContext());
+    Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", PointerVT);
+    Entry.Node = Src;
+    Args.push_back(Entry);
+    break;
+  }
+  case RTLIB::MEMMOVE: {
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = PointerType::getUnqual(*DAG.getContext());
+    Symbol = DAG.getExternalSymbol("__arm_sc_memmove", PointerVT);
+    Entry.Node = Src;
+    Args.push_back(Entry);
+    break;
+  }
+  case RTLIB::MEMSET: {
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Symbol = DAG.getExternalSymbol("__arm_sc_memset", PointerVT);
+    Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
+    Entry.Node = Src;
+    Args.push_back(Entry);
+    break;
+  }
+  default:
+    return SDValue();
+  }
+
+  TargetLowering::ArgListEntry SizeEntry;
+  SizeEntry.Node = Size;
+  SizeEntry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  Args.push_back(SizeEntry);
+  assert(Symbol->getOpcode() == ISD::ExternalSymbol &&
+         "Function name is not set");
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  PointerType *RetTy = PointerType::getUnqual(*DAG.getContext());
+  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+      TLI->getLibcallCallingConv(LC), RetTy, Symbol, std::move(Args));
+  return TLI->LowerCallTo(CLI).second;
+}
+
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+
   if (STI.hasMOPS())
     return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
+    return EmitStreamingCompatibleMemLibCall(DAG, DL, Chain, Dst, Src, Size,
+                                             RTLIB::MEMCPY);
   return SDValue();
 }
 
@@ -95,10 +165,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
 
-  if (STI.hasMOPS()) {
+  if (STI.hasMOPS())
     return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
-  }
+
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
+    return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
+                                             RTLIB::MEMSET);
   return SDValue();
 }
 
@@ -108,10 +182,15 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
-  if (STI.hasMOPS()) {
+
+  if (STI.hasMOPS())
     return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
-  }
+
+  SMEAttrs Attrs(DAG.getMachineFunction().getFunction());
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
+    return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
+                                             RTLIB::MEMMOVE);
   return SDValue();
 }
 

diff  --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 73f93724d6fc73..514de44778630e 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -47,6 +47,11 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   MachinePointerInfo DstPtrInfo,
                                   bool ZeroData) const override;
+
+  SDValue EmitStreamingCompatibleMemLibCall(SelectionDAG &DAG, const SDLoc &DL,
+                                            SDValue Chain, SDValue Dst,
+                                            SDValue Src, SDValue Size,
+                                            RTLIB::Libcall LC) const;
 };
 }
 

diff  --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index d399e0ac0794f6..015ca4cb92b25e 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -53,6 +53,9 @@ SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) {
   if (FuncName == "__arm_tpidr2_restore")
     Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) |
                SMEAttrs::SME_ABI_Routine;
+  if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" ||
+      FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr")
+    Bitmask |= SMEAttrs::SM_Compatible;
 }
 
 SMEAttrs::SMEAttrs(const AttributeList &Attrs) {

diff  --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
new file mode 100644
index 00000000000000..c39894c27d9d4d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
+
+ at dst = global [512 x i8] zeroinitializer, align 1
+ at src = global [512 x i8] zeroinitializer, align 1
+
+define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
+; CHECK-LABEL: se_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: se_memcpy:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
+; CHECK-LABEL: se_memset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    bl __arm_sc_memset
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    mov w1, #2 // =0x2
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memset
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: se_memset:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:dst
+; CHECK-MOPS-NEXT:    mov w9, #2 // =0x2
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    setp [x8]!, x0!, x9
+; CHECK-MOPS-NEXT:    setm [x8]!, x0!, x9
+; CHECK-MOPS-NEXT:    sete [x8]!, x0!, x9
+; CHECK-MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
+; CHECK-LABEL: se_memmove:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memmove
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memmove
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: se_memmove:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpym [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpye [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: sc_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    bl __arm_sme_state
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x8, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_2
+; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_2: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x0, x8
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_4
+; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: sc_memcpy:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: sb_memcpy:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x0, :got:dst
+; CHECK-NEXT:    adrp x1, :got:src
+; CHECK-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NEXT:    bl __arm_sc_memcpy
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: sb_memcpy:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    smstart sm
+; CHECK-MOPS-NEXT:    adrp x8, :got:src
+; CHECK-MOPS-NEXT:    adrp x9, :got:dst
+; CHECK-MOPS-NEXT:    ldr x8, [x8, :got_lo12:src]
+; CHECK-MOPS-NEXT:    ldr x9, [x9, :got_lo12:dst]
+; CHECK-MOPS-NEXT:    cpyfp [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
+; CHECK-MOPS-NEXT:    smstop sm
+; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ret
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)


        


More information about the llvm-commits mailing list