[llvm] [llvm][ARM][CodeGen] Disable MEMCPY LDM/STM inlining for Cortex v7-m (PR #106378)
Nashe Mncube via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 19 01:58:39 PDT 2024
https://github.com/nasherm updated https://github.com/llvm/llvm-project/pull/106378
>From 3f56dab3ad0f870250d24c49b1eab3365a157728 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 13 Aug 2024 10:55:51 +0100
Subject: [PATCH 1/3] [ARM][CodeGen] Disable MEMCPY LDM/STM inlining for v7-m
This patch disables the expansion of MEMCPY to LDM/STM
on v7-m targets, where this inlining method was measured
to cause a performance slowdown.
Change-Id: I91095299c2c67670a16849d08540bdbc07a95adc
---
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 116 ++++++++++++++
llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 +
llvm/lib/Target/ARM/ARMSubtarget.h | 10 ++
llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++
4 files changed, 297 insertions(+)
create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index c57825949c1cef..dcf1f3d04a9e17 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -138,6 +138,116 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
return CallResult.second;
}
+SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
+ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
+ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // Do repeated batches of 4-byte loads and stores.
+ unsigned BytesLeft = SizeVal & 3;
+ unsigned NumMemOps = SizeVal >> 2;
+ unsigned EmittedNumMemOps = 0;
+ EVT VT = MVT::i32;
+ unsigned VTSize = 4;
+ unsigned i = 0;
+ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
+ SDValue TFOps[6];
+ SDValue Loads[6];
+ uint64_t SrcOff = 0, DstOff = 0;
+
+ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
+ if (isVolatile)
+ MOFlags = MachineMemOperand::Flags::MOVolatile;
+ MachineMemOperand::Flags LoadMOFlags = MOFlags;
+ if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
+ DAG.getDataLayout()))
+ LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
+ if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
+ if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
+ LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
+ MachineMemOperand::Flags StoreMOFlags = MOFlags;
+ if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
+ DAG.getDataLayout()))
+ StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
+
+ // Emit up to MaxLoads loads, then a TokenFactor barrier, then the
+ // same number of stores. The loads and stores may get combined into
+ // ldm/stm later on.
+ while (EmittedNumMemOps < NumMemOps) {
+ for (i = 0; i < MaxLoads && EmittedNumMemOps + i < NumMemOps; ++i) {
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
+ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
+ LoadMOFlags);
+ TFOps[i] = Loads[i].getValue(1);
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+
+ for (i = 0; i < MaxLoads && EmittedNumMemOps + i < NumMemOps; ++i) {
+ TFOps[i] = DAG.getStore(
+ Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, dl, MVT::i32)),
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
+ DstOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+
+ EmittedNumMemOps += i;
+ }
+
+ if (BytesLeft == 0)
+ return Chain;
+
+ // Issue loads / stores for the trailing (1 - 3) bytes.
+ unsigned BytesLeftSave = BytesLeft;
+ i = 0;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
+ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
+ LoadMOFlags);
+ TFOps[i] = Loads[i].getValue(1);
+ ++i;
+ SrcOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+
+ i = 0;
+ BytesLeft = BytesLeftSave;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, dl, MVT::i32)),
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
+ StoreMOFlags);
+ ++i;
+ DstOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+}
+
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
const SelectionDAG &DAG,
ConstantSDNode *ConstantSize,
@@ -192,6 +302,12 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMCPY);
+ if (Subtarget.isCortexM7() &&
+ (Subtarget.wantsMEMCPYAsLdSt() ||
+ (isVolatile && Subtarget.wantsVolatileMEMCPYAsLdSt())))
+ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
+ isVolatile, DstPtrInfo, SrcPtrInfo);
+
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
unsigned EmittedNumMemOps = 0;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index 275b1c0f8dc017..6ff422c15b1201 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -44,6 +44,12 @@ class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
+ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
+ const ARMSubtarget &Subtarget, SDValue Chain,
+ SDValue Dst, SDValue Src, uint64_t SizeVal,
+ bool isVolatile, MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const;
+
SDValue
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 00239ff94b7ba5..12cd6b15a5b5dc 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -465,6 +465,16 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
/// True if fast-isel is used.
bool useFastISel() const;
+ /// True if ARMISD::MEMCPY should not be created/expanded specially (e.g.
+ /// as LDM/STM pairs).
+ bool wantsMEMCPYAsLdSt() const { return HasV7Ops && ARMProcClass == MClass; }
+
+ /// True if volatile ARMISD::MEMCPY should not be created/expanded specially
+ /// (e.g. as LDM/STM pairs).
+ bool wantsVolatileMEMCPYAsLdSt() const {
+ return ARMProcClass == MClass && HasV6Ops && !HasV7Ops;
+ }
+
/// Returns the correct return opcode for the current feature set.
/// Use BX if available to allow mixing thumb/arm code, but fall back
/// to plain mov pc,lr on ARMv4.
diff --git a/llvm/test/CodeGen/ARM/memcpy-v7m.ll b/llvm/test/CodeGen/ARM/memcpy-v7m.ll
new file mode 100644
index 00000000000000..2a90f44fe3d348
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/memcpy-v7m.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=thumbv7em-eabi -mcpu=cortex-m7 -verify-machineinstrs %s -o - | FileCheck %s
+
+ at d = external global [64 x i32]
+ at s = external global [64 x i32]
+ at d_32 = external global[32 x i32]
+ at s_32 = external global[32 x i32]
+
+
+; Function Attrs: nounwind
+define void @t1() #0 {
+; CHECK-LABEL: t1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r0, :lower16:d
+; CHECK-NEXT: movw r2, :lower16:s
+; CHECK-NEXT: movt r0, :upper16:d
+; CHECK-NEXT: movt r2, :upper16:s
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: str r1, [r2]
+; CHECK-NEXT: ldr r3, [r0, #4]
+; CHECK-NEXT: str r3, [r2, #4]
+; CHECK-NEXT: ldr r1, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0, #12]
+; CHECK-NEXT: ldrb r0, [r0, #16]
+; CHECK-NEXT: strd r1, r3, [r2, #8]
+; CHECK-NEXT: strb r0, [r2, #16]
+; CHECK-NEXT: bx lr
+entry:
+; Note: checks are autogenerated with exact registers; the old '[rl0-9]+' regex checks are no longer used
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @t2() #0 {
+; CHECK-LABEL: t2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r0, :lower16:d
+; CHECK-NEXT: movw r1, :lower16:s
+; CHECK-NEXT: movt r0, :upper16:d
+; CHECK-NEXT: movt r1, :upper16:s
+; CHECK-NEXT: ldr.w r2, [r0, #11]
+; CHECK-NEXT: str.w r2, [r1, #11]
+; CHECK-NEXT: ldr r2, [r0]
+; CHECK-NEXT: str r2, [r1]
+; CHECK-NEXT: ldr r2, [r0, #4]
+; CHECK-NEXT: str r2, [r1, #4]
+; CHECK-NEXT: ldr r0, [r0, #8]
+; CHECK-NEXT: str r0, [r1, #8]
+; CHECK-NEXT: bx lr
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+
+
+define void @t3() #0 {
+; CHECK-LABEL: t3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:d_32
+; CHECK-NEXT: movw r2, :lower16:s_32
+; CHECK-NEXT: movt r0, :upper16:d_32
+; CHECK-NEXT: movt r2, :upper16:s_32
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: str r1, [r2]
+; CHECK-NEXT: ldr r3, [r0, #4]
+; CHECK-NEXT: str r3, [r2, #4]
+; CHECK-NEXT: ldr r1, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0, #12]
+; CHECK-NEXT: ldrb r0, [r0, #16]
+; CHECK-NEXT: strd r1, r3, [r2, #8]
+; CHECK-NEXT: strb r0, [r2, #16]
+; CHECK-NEXT: bx lr
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([32 x i32]* @s_32 to i8*), i8* bitcast ([32 x i32]* @d_32 to i8*), i32 17, i32 4, i1 false)
+ ret void
+}
+
+define void @t4() #0 {
+; CHECK-LABEL: t4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:d_32
+; CHECK-NEXT: movw r1, :lower16:s_32
+; CHECK-NEXT: movt r0, :upper16:d_32
+; CHECK-NEXT: movt r1, :upper16:s_32
+; CHECK-NEXT: ldr.w r2, [r0, #11]
+; CHECK-NEXT: str.w r2, [r1, #11]
+; CHECK-NEXT: ldr r2, [r0]
+; CHECK-NEXT: str r2, [r1]
+; CHECK-NEXT: ldr r2, [r0, #4]
+; CHECK-NEXT: str r2, [r1, #4]
+; CHECK-NEXT: ldr r0, [r0, #8]
+; CHECK-NEXT: str r0, [r1, #8]
+; CHECK-NEXT: bx lr
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([32 x i32]* @s_32 to i8*), i8* bitcast ([32 x i32]* @d_32 to i8*), i32 15, i32 4, i1 false)
+ ret void
+}
+
+define void @t5() #0 {
+; CHECK-LABEL: t5:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: movw r0, :lower16:d
+; CHECK-NEXT: movw r1, :lower16:s
+; CHECK-NEXT: movt r0, :upper16:d
+; CHECK-NEXT: movt r1, :upper16:s
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: add.w r12, r0, #12
+; CHECK-NEXT: ldr r3, [r0, #24]
+; CHECK-NEXT: ldrd r2, lr, [r0, #4]
+; CHECK-NEXT: ldm.w r12, {r4, r5, r12}
+; CHECK-NEXT: str r3, [r1, #24]
+; CHECK-NEXT: add.w r3, r1, #12
+; CHECK-NEXT: strd r2, lr, [r1, #4]
+; CHECK-NEXT: stm.w r3, {r4, r5, r12}
+; CHECK-NEXT: ldr r0, [r0, #28]
+; CHECK-NEXT: str r0, [r1, #28]
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %0 = load i32*, i32** @s, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+ %1 = bitcast i32* %arrayidx to i8*
+ %2 = load i32*, i32** @d, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+ %3 = bitcast i32* %arrayidx1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
+ ret void
+}
+
+define void @t6() #0 {
+; CHECK-LABEL: t6:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: movw r0, :lower16:d
+; CHECK-NEXT: movw r1, :lower16:s
+; CHECK-NEXT: movt r0, :upper16:d
+; CHECK-NEXT: movt r1, :upper16:s
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: add.w r12, r0, #12
+; CHECK-NEXT: ldr r3, [r0, #24]
+; CHECK-NEXT: ldrd r2, lr, [r0, #4]
+; CHECK-NEXT: ldm.w r12, {r4, r5, r12}
+; CHECK-NEXT: str r3, [r1, #24]
+; CHECK-NEXT: add.w r3, r1, #12
+; CHECK-NEXT: strd r2, lr, [r1, #4]
+; CHECK-NEXT: stm.w r3, {r4, r5, r12}
+; CHECK-NEXT: ldr r0, [r0, #28]
+; CHECK-NEXT: str r0, [r1, #28]
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %0 = load i32*, i32** @s, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+ %1 = bitcast i32* %arrayidx to i8*
+ %2 = load i32*, i32** @d, align 8
+ %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+ %3 = bitcast i32* %arrayidx1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
+ ret void
+}
>From 6253180285262140e70bc72dd54a9446c0492378 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Wed, 18 Sep 2024 15:04:27 +0100
Subject: [PATCH 2/3] Responding to review comments
Change-Id: Iab80d5b301219eab2fda94eeec2c097152accd0e
---
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 53 +++++++++++----------
llvm/lib/Target/ARM/ARMSubtarget.h | 12 +----
2 files changed, 31 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index dcf1f3d04a9e17..12db2ab1fca2fa 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -12,6 +12,7 @@
#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
@@ -148,11 +149,11 @@ SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
unsigned EmittedNumMemOps = 0;
EVT VT = MVT::i32;
unsigned VTSize = 4;
- unsigned i = 0;
+ unsigned I = 0;
// Emit a maximum of 4 loads in Thumb1 since we have fewer registers
const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
- SDValue TFOps[6];
- SDValue Loads[6];
+ SmallVector<SDValue> TFOps(6);
+ SmallVector<SDValue> Loads(6);
uint64_t SrcOff = 0, DstOff = 0;
MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
@@ -174,28 +175,30 @@ SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
// same number of stores. The loads and stores may get combined into
// ldm/stm later on.
while (EmittedNumMemOps < NumMemOps) {
- for (i = 0; i < MaxLoads && EmittedNumMemOps + i < NumMemOps; ++i) {
- Loads[i] = DAG.getLoad(VT, dl, Chain,
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
+ Loads[I] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
LoadMOFlags);
- TFOps[i] = Loads[i].getValue(1);
+ TFOps[I] = Loads[I].getValue(1);
SrcOff += VTSize;
}
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(TFOps.data(), I));
- for (i = 0; i < MaxLoads && EmittedNumMemOps + i < NumMemOps; ++i) {
- TFOps[i] = DAG.getStore(
- Chain, dl, Loads[i],
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
+ TFOps[I] = DAG.getStore(
+ Chain, dl, Loads[I],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
DstOff += VTSize;
}
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(TFOps.data(), I));
- EmittedNumMemOps += i;
+ EmittedNumMemOps += I;
}
if (BytesLeft == 0)
@@ -203,7 +206,7 @@ SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
// Issue loads / stores for the trailing (1 - 3) bytes.
unsigned BytesLeftSave = BytesLeft;
- i = 0;
+ I = 0;
while (BytesLeft) {
if (BytesLeft >= 2) {
VT = MVT::i16;
@@ -213,19 +216,21 @@ SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
VTSize = 1;
}
- Loads[i] = DAG.getLoad(VT, dl, Chain,
+ Loads[I] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
LoadMOFlags);
- TFOps[i] = Loads[i].getValue(1);
- ++i;
+
+ TFOps[I] = Loads[I].getValue(1);
+ ++I;
SrcOff += VTSize;
BytesLeft -= VTSize;
}
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+ Chain =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I));
- i = 0;
+ I = 0;
BytesLeft = BytesLeftSave;
while (BytesLeft) {
if (BytesLeft >= 2) {
@@ -236,16 +241,18 @@ SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
VTSize = 1;
}
- TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
StoreMOFlags);
- ++i;
+ ++I;
DstOff += VTSize;
BytesLeft -= VTSize;
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(TFOps.data(), I));
}
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
@@ -302,9 +309,7 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMCPY);
- if (Subtarget.isCortexM7() &&
- (Subtarget.wantsMEMCPYAsLdSt() ||
- (isVolatile && Subtarget.wantsVolatileMEMCPYAsLdSt())))
+ if (Subtarget.allowInlineMemcpyAsLdSt())
return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
isVolatile, DstPtrInfo, SrcPtrInfo);
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 12cd6b15a5b5dc..9a1b3137138d61 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -465,16 +465,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
/// True if fast-isel is used.
bool useFastISel() const;
- /// True if ARMISD::MEMCPY should not be created/expanded specially (e.g.
- /// as LDM/STM pairs).
- bool wantsMEMCPYAsLdSt() const { return HasV7Ops && ARMProcClass == MClass; }
-
- /// True if volatile ARMISD::MEMCPY should not be created/expanded specially
- /// (e.g. as LDM/STM pairs).
- bool wantsVolatileMEMCPYAsLdSt() const {
- return ARMProcClass == MClass && HasV6Ops && !HasV7Ops;
- }
-
/// Returns the correct return opcode for the current feature set.
/// Use BX if available to allow mixing thumb/arm code, but fall back
/// to plain mov pc,lr on ARMv4.
@@ -505,6 +495,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
+
+ bool allowInlineMemcpyAsLdSt() const { return hasV7Ops() && isMClass(); }
};
} // end namespace llvm
>From 753c9de3b228d30d5f07f03a200b0d390be811cb Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Thu, 19 Sep 2024 09:56:03 +0100
Subject: [PATCH 3/3] Responding to review comments
Change-Id: I2c63c7775d8a295f448ebee231c7b8e7022d4868
---
llvm/lib/Target/ARM/ARMFeatures.td | 5 +++++
llvm/lib/Target/ARM/ARMProcessors.td | 2 +-
llvm/lib/Target/ARM/ARMSubtarget.h | 2 +-
3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index 8b0ade54b46d3c..0c90e4e7fa4765 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -500,6 +500,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
"DisablePostRAScheduler", "true",
"Don't schedule again after register allocation">;
+def FeatureUseInlineMemcpyAsLdSt :
+ SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt",
+ "true", "Use memcpy inlining as LD/ST instructions">;
+
+
// Armv8.5-A extensions
// Has speculation barrier.
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index e4e122a0d1339b..2270f0669601a8 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
"Cortex-M3 ARM processors", []>;
def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
- "Cortex-M7 ARM processors", []>;
+ "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>;
//===----------------------------------------------------------------------===//
// ARM processors
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 9a1b3137138d61..161fd296dadde0 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -496,7 +496,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
unsigned PhysReg) const override;
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
- bool allowInlineMemcpyAsLdSt() const { return hasV7Ops() && isMClass(); }
+ bool allowInlineMemcpyAsLdSt() const { return UseInlineMemcpyAsLdSt; }
};
} // end namespace llvm
More information about the llvm-commits
mailing list