[llvm] dfe3ffa - [ARM] Transforming memset to Tail predicated Loop
Malhar Jajoo via llvm-commits
llvm-commits at lists.llvm.org
Fri May 7 05:36:02 PDT 2021
Author: Malhar Jajoo
Date: 2021-05-07T13:35:53+01:00
New Revision: dfe3ffaa4a47ea93cc289b4496c093fbaf73adbc
URL: https://github.com/llvm/llvm-project/commit/dfe3ffaa4a47ea93cc289b4496c093fbaf73adbc
DIFF: https://github.com/llvm/llvm-project/commit/dfe3ffaa4a47ea93cc289b4496c093fbaf73adbc.diff
LOG: [ARM] Transforming memset to Tail predicated Loop
This patch converts the llvm.memset intrinsic into a Tail Predicated
(TP) Hardware Loop for targets that support the Arm M-profile
Vector Extension (MVE).
The llvm.memset is converted to a TP loop for both constant and
non-constant input sizes.
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D100435
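
For illustration, this is the kind of code the lowering targets (a sketch,
not part of the patch; the instructions in the comments are taken from the
mve-tp-loop.ll tests added below, assuming an MVE target such as
-mtriple=thumbv8.1m.main -mattr=+mve.fp at -O2):

#include <string.h>

// The memset below becomes the llvm.memset intrinsic, which this patch
// expands into a tail predicated loop instead of a __aeabi_memset libcall,
// roughly:
//   vdup.8  q0, r1           @ splat the fill byte across a q register
//   wlstp.8 lr, r2, .Lexit   @ while-loop-start, tail predicated on n
// .Lloop:
//   vstrb.8 q0, [r0], #16    @ predicated 16-byte store, post-increment
//   letp    lr, .Lloop       @ loop-end, tail predicated
// .Lexit:
void fill(char *dst, char c, int n) { memset(dst, c, (size_t)n); }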
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMISelLowering.h
llvm/lib/Target/ARM/ARMInstrMVE.td
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
llvm/lib/Target/ARM/ARMSubtarget.h
llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
llvm/test/CodeGen/Thumb2/mve-phireg.ll
llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 46ac4cd20d41b..f1f3d4c6e895c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1803,6 +1803,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(ARMISD::CSNEG)
MAKE_CASE(ARMISD::CSINC)
MAKE_CASE(ARMISD::MEMCPYLOOP)
+ MAKE_CASE(ARMISD::MEMSETLOOP)
#undef MAKE_CASE
}
return nullptr;
@@ -11105,7 +11106,6 @@ static Register genTPEntry(MachineBasicBlock *TpEntry,
MachineBasicBlock *TpExit, Register OpSizeReg,
const TargetInstrInfo *TII, DebugLoc Dl,
MachineRegisterInfo &MRI) {
-
// Calculates loop iteration count = ceil(n/16) = ((n + 15)&(-16)) / 16.
Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
@@ -11147,17 +11147,21 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
const TargetInstrInfo *TII, DebugLoc Dl,
MachineRegisterInfo &MRI, Register OpSrcReg,
Register OpDestReg, Register ElementCountReg,
- Register TotalIterationsReg) {
-
- // First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop
- // iteration counter, predication counter Current position in the src array
- Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
- Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
- BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
- .addUse(OpSrcReg)
- .addMBB(TpEntry)
- .addUse(CurrSrcReg)
- .addMBB(TpLoopBody);
+ Register TotalIterationsReg, bool IsMemcpy) {
+ // First insert the PHI nodes: the current Src pointer (memcpy only), the
+ // current Dest pointer, the loop iteration counter, and the predication
+ // counter.
+
+ Register SrcPhiReg, CurrSrcReg;
+ if (IsMemcpy) {
+ // Current position in the src array
+ SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
+ .addUse(OpSrcReg)
+ .addMBB(TpEntry)
+ .addUse(CurrSrcReg)
+ .addMBB(TpLoopBody);
+ }
// Current position in the dest array
Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
@@ -11200,19 +11204,23 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
.add(predOps(ARMCC::AL))
.addReg(0);
- // VLDRB and VSTRB instructions, predicated using VPR
- Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
- BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
- .addDef(CurrSrcReg)
- .addDef(LoadedValueReg)
- .addReg(SrcPhiReg)
- .addImm(16)
- .addImm(ARMVCC::Then)
- .addUse(VccrReg);
+ // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
+ Register SrcValueReg;
+ if (IsMemcpy) {
+ SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
+ .addDef(CurrSrcReg)
+ .addDef(SrcValueReg)
+ .addReg(SrcPhiReg)
+ .addImm(16)
+ .addImm(ARMVCC::Then)
+ .addUse(VccrReg);
+ } else
+ SrcValueReg = OpSrcReg;
BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
.addDef(CurrDestReg)
- .addUse(LoadedValueReg, RegState::Kill)
+ .addUse(SrcValueReg)
.addReg(DestPhiReg)
.addImm(16)
.addImm(ARMVCC::Then)
@@ -11259,9 +11267,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
- case ARM::MVE_MEMCPYLOOPINST: {
+ case ARM::MVE_MEMCPYLOOPINST:
+ case ARM::MVE_MEMSETLOOPINST: {
- // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction
+ // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
// into a Tail Predicated (TP) Loop. It adds the instructions to calculate
// the iteration count = ceil(size_in_bytes/16) in the TP entry block and
// adds the relevant instructions in the TP loop Body for generation of a
@@ -11301,23 +11310,24 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MF->push_back(TpLoopBody);
// If any instructions are present in the current block after
- // MVE_MEMCPYLOOPINST, split the current block and move the instructions
- // into the newly created exit block. If there are no instructions
- // add an explicit branch to the FallThrough block and then split.
+ // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
+ // move the instructions into the newly created exit block. If there are no
+ // instructions add an explicit branch to the FallThrough block and then
+ // split.
//
// The split is required for two reasons:
// 1) A terminator (t2WhileLoopStart) will be placed at that site.
// 2) Since a TPLoopBody will be added later, any phis in successive blocks
// need to be updated. splitAt() already handles this.
- TpExit = BB->splitAt(MI, false);
+ TpExit = BB->splitAt(MI);
if (TpExit == BB) {
- assert(BB->canFallThrough() &&
- "Exit block must be FallThrough of the block containing memcpy");
+ assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
+ "block containing memcpy/memset Pseudo");
TpExit = BB->getFallThrough();
BuildMI(BB, dl, TII->get(ARM::t2B))
.addMBB(TpExit)
.add(predOps(ARMCC::AL));
- TpExit = BB->splitAt(MI, false);
+ TpExit = BB->splitAt(MI);
}
// Add logic for iteration count
@@ -11325,8 +11335,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
// Add the vectorized (and predicated) loads/store instructions
+ bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
- OpDestReg, OpSizeReg, TotalIterationsReg);
+ OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
// Required to avoid conflict with the MachineVerifier during testing.
Properties.reset(MachineFunctionProperties::Property::NoPHIs);
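
A worked example of the trip-count computation in genTPEntry above (a
sketch of the comment's formula, not code from the patch): rounding n up to
the next multiple of 16 and dividing by 16 yields ceil(n/16), so a 100-byte
memset runs 7 iterations, with VCTP predicating the last one down to the
remaining 4 bytes.

#include <cassert>
#include <cstdint>

// ceil(n/16) via the same bit trick the TP entry block emits:
// ((n + 15) & -16) / 16.
static uint32_t tpIterations(uint32_t n) { return ((n + 15u) & ~15u) / 16u; }

int main() {
  assert(tpIterations(100) == 7); // 6 full 16-byte stores + 1 predicated
  assert(tpIterations(112) == 7); // exact multiple: no predicated tail
  assert(tpIterations(1) == 1);   // tiny sizes still take one iteration
  return 0;
}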
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index b604fae7bc5a1..8c5704427a3d9 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -303,6 +303,9 @@ class VectorType;
// Pseudo-instruction representing a memory copy using a tail predicated
// loop
MEMCPYLOOP,
+ // Pseudo-instruction representing a memset using a tail predicated
+ // loop
+ MEMSETLOOP,
// V8.1MMainline condition select
CSINV, // Conditional select invert.
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index bf9e52fbdf113..d7c034800b2be 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -6877,6 +6877,18 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
[(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
}
+def SDT_MVEMEMSETLOOPNODE
+ : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, v16i8>, SDTCisVT<2, i32>]>;
+def MVE_MEMSETLOOPNODE : SDNode<"ARMISD::MEMSETLOOP", SDT_MVEMEMSETLOOPNODE,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ def MVE_MEMSETLOOPINST : PseudoInst<(outs),
+ (ins rGPR:$dst, MQPR:$src, rGPR:$sz),
+ NoItinerary,
+ [(MVE_MEMSETLOOPNODE rGPR:$dst, MQPR:$src, rGPR:$sz)]>;
+}
+
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index a0c82851b2db1..12d4ad889897e 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -139,6 +139,33 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
return CallResult.second;
}
+static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
+ const SelectionDAG &DAG,
+ ConstantSDNode *ConstantSize,
+ Align Alignment, bool IsMemcpy) {
+ auto &F = DAG.getMachineFunction().getFunction();
+ if (!EnableMemtransferTPLoop)
+ return false;
+ if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
+ return true;
+ // Do not generate an inline TP loop if optimizations are disabled,
+ // or if optimization for size (-Os or -Oz) is on.
+ if (F.hasOptNone() || F.hasOptSize())
+ return false;
+ // If the CLI option is unset, always generate an inline TP loop for memset.
+ // For memcpy, check the conditions below.
+ if (!IsMemcpy)
+ return true;
+ if (!ConstantSize && Alignment >= Align(4))
+ return true;
+ if (ConstantSize &&
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
+ ConstantSize->getZExtValue() <
+ Subtarget.getMaxMemcpyTPInlineSizeThreshold())
+ return true;
+ return false;
+}
+
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
@@ -147,29 +174,8 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
- auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
- const SelectionDAG &DAG) {
- auto &F = DAG.getMachineFunction().getFunction();
- if (!EnableMemtransferTPLoop)
- return false;
- if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
- return true;
- // Do not generate inline TP loop if optimizations is disabled,
- // or if optimization for size (-Os or -Oz) is on.
- if (F.hasOptNone() || F.hasOptSize())
- return false;
- // If cli option is unset
- if (!ConstantSize && Alignment >= Align(4))
- return true;
- if (ConstantSize &&
- ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
- ConstantSize->getZExtValue() <
- Subtarget.getMaxTPLoopInlineSizeThreshold())
- return true;
- return false;
- };
-
- if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
+ if (Subtarget.hasMVEIntegerOps() &&
+ shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
DAG.getZExtOrTrunc(Size, dl, MVT::i32));
@@ -292,6 +298,22 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
+
+ const ARMSubtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ // Generate TP loop for llvm.memset
+ if (Subtarget.hasMVEIntegerOps() &&
+ shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
+ false)) {
+ Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
+ return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
+ DAG.getZExtOrTrunc(Size, dl, MVT::i32));
+ }
+
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMSET);
}
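
The heuristic above condenses to a small decision table once the option and
optimization-level checks pass. The sketch below restates it (not the
patch's code; it assumes the 64-byte getMaxInlineSizeThreshold shown in
ARMSubtarget.h below and the new 128-byte memcpy TP threshold):

// Sketch of the tail of shouldGenerateInlineTPLoop, after the
// EnableMemtransferTPLoop and hasOptNone/hasOptSize checks:
struct SizeInfo {
  bool IsConstant;   // is the size a compile-time constant?
  unsigned Bytes;    // constant size in bytes, if IsConstant
  unsigned Align;    // known alignment of the destination
};

static bool wantsTPLoop(bool IsMemcpy, const SizeInfo &S) {
  if (!IsMemcpy)
    return true;                        // memset: always take the TP loop
  if (!S.IsConstant)
    return S.Align >= 4;                // memcpy, runtime size: align >= 4
  return S.Bytes > 64 && S.Bytes < 128; // memcpy, constant size in (64, 128)
}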
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 91c7b7cf20037..5d5fe7199db38 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -538,10 +538,11 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
return 64;
}
- /// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size
- /// that still makes it profitable to inline the call as a Tail
- /// Predicated loop
- unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
+ /// getMaxMemcpyTPInlineSizeThreshold - Returns the maximum size
+ /// that still makes it profitable to inline an llvm.memcpy as a Tail
+ /// Predicated loop.
+ /// This threshold should only be used for constant size inputs.
+ unsigned getMaxMemcpyTPInlineSizeThreshold() const { return 128; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index a489a02e6d029..3469dba948d33 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -58,28 +58,35 @@ for.body: ; preds = %entry, %for.body
define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
; CHECK-LABEL: test_memset:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
-; CHECK-NEXT: blt .LBB1_3
-; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: lsls r7, r2, #2
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r4, pc}
+; CHECK-NEXT: .LBB1_1: @ %for.body.preheader
+; CHECK-NEXT: lsl.w r12, r2, #2
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_2: @ %for.body
-; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_memclr4
-; CHECK-NEXT: add r6, r7
-; CHECK-NEXT: subs r5, #1
-; CHECK-NEXT: bne .LBB1_2
-; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r3, r2
+; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_3
+; CHECK-NEXT: b .LBB1_4
+; CHECK-NEXT: .LBB1_3: @ %for.body
+; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: add r0, r12
+; CHECK-NEXT: subs r1, #1
+; CHECK-NEXT: beq .LBB1_5
+; CHECK-NEXT: b .LBB1_2
+; CHECK-NEXT: .LBB1_4: @ Parent Loop BB1_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: vstrb.8 q0, [r4], #16
+; CHECK-NEXT: letp lr, .LBB1_4
+; CHECK-NEXT: b .LBB1_3
+; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
entry:
%cmp5 = icmp sgt i32 %n, 0
br i1 %cmp5, label %for.body, label %for.cond.cleanup
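
A rough C analogue of memcall.ll's test_memset (reconstructed from the
checks above, so the exact source is illustrative): the memset in the loop
body previously compiled to one __aeabi_memclr4 call per iteration and now
expands to the inner wlstp.8/vstrb.8/letp loop.

#include <string.h>

void test_memset(int *x, int n, int m) {
  for (int i = 0; i < n; ++i)
    // Each iteration zeroes a block and advances the pointer; with this
    // patch the call becomes an inline tail predicated store loop.
    memset(x + (size_t)i * (size_t)m, 0, (size_t)m);
}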
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 81d0c6c0f0b3e..d6e505e170742 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow %s -o 2>/dev/null - | FileCheck %s
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"min_enum_size", i32 4}
@@ -592,141 +592,147 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne r0, [sp, #112]
+; CHECK-NEXT: ldrne r0, [sp, #136]
; CHECK-NEXT: cmpne r0, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT: ldr.w r9, [sp, #116]
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: movs r1, #1
-; CHECK-NEXT: mov r11, r2
-; CHECK-NEXT: bic r10, r9, #3
-; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: sub.w r0, r10, #4
-; CHECK-NEXT: add.w r0, r1, r0, lsr #2
-; CHECK-NEXT: ldr r1, [sp, #112]
-; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: lsl.w r0, r9, #1
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT: adr r0, .LCPI10_0
-; CHECK-NEXT: vdup.32 q4, r1
-; CHECK-NEXT: vldrw.u32 q5, [r0]
-; CHECK-NEXT: lsls r4, r1, #1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vshl.i32 q6, q4, #2
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: ldr.w r12, [sp, #140]
+; CHECK-NEXT: movs r7, #1
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: bic r2, r12, #3
+; CHECK-NEXT: subs r3, r2, #4
+; CHECK-NEXT: add.w r0, r7, r3, lsr #2
+; CHECK-NEXT: ldr r7, [sp, #136]
+; CHECK-NEXT: adr r3, .LCPI10_0
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r0, r12, #1
+; CHECK-NEXT: vdup.32 q1, r7
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: lsls r6, r7, #1
+; CHECK-NEXT: vshl.i32 q3, q1, #2
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: add.w r0, r11, r12, lsl #1
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_memclr
+; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r3, r0, r5, lsl #1
+; CHECK-NEXT: mov r5, r6
+; CHECK-NEXT: wlstp.8 lr, r5, .LBB10_4
+; CHECK-NEXT: b .LBB10_15
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r8, r9
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add r1, r0
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: adds r1, #1
-; CHECK-NEXT: cmp r1, r0
+; CHECK-NEXT: add r11, r12
+; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r3, r0
+; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r3, #1
+; CHECK-NEXT: cmp r3, r0
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_8 Depth 2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT: ldr r0, [sp, #112]
-; CHECK-NEXT: cmp.w r9, #0
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: mul r12, r1, r0
+; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
+; CHECK-NEXT: mul r5, r3, r7
+; CHECK-NEXT: cmp.w r12, #0
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: b .LBB10_8
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #112]
-; CHECK-NEXT: add.w r3, r1, r12
-; CHECK-NEXT: adds r1, #1
-; CHECK-NEXT: cmp r1, r0
-; CHECK-NEXT: strh.w r2, [r11, r3, lsl #1]
+; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r0, r8, r5
+; CHECK-NEXT: add.w r8, r8, #1
+; CHECK-NEXT: cmp r8, r7
+; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT: cmp.w r9, #3
+; CHECK-NEXT: cmp.w r12, #3
; CHECK-NEXT: bhi .LBB10_10
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: vmov q1, q4
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vmlas.u32 q1, q5, r1
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmlas.u32 q5, q2, r8
; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: .LBB10_11: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q2, q1, q6
-; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q1, [r2], #8
-; CHECK-NEXT: vmul.i32 q1, q3, q1
-; CHECK-NEXT: vadd.i32 q0, q1, q0
-; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vadd.i32 q6, q5, q3
+; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q5, [r3], #8
+; CHECK-NEXT: vmul.i32 q5, q7, q5
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: vaddv.u32 r2, q0
-; CHECK-NEXT: cmp r10, r9
-; CHECK-NEXT: mov r7, r10
+; CHECK-NEXT: vaddv.u32 r10, q4
+; CHECK-NEXT: cmp r2, r12
+; CHECK-NEXT: mov r4, r2
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #112]
-; CHECK-NEXT: add.w r5, r8, r7
-; CHECK-NEXT: sub.w lr, r9, r7
-; CHECK-NEXT: mla r3, r0, r7, r1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r0, r5, lsl #1
-; CHECK-NEXT: add.w r3, r6, r3, lsl #1
+; CHECK-NEXT: mla r3, r7, r4, r8
+; CHECK-NEXT: add.w r0, r11, r4
+; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: sub.w lr, r12, r4
+; CHECK-NEXT: add.w r9, r7, r0, lsl #1
+; CHECK-NEXT: ldr r7, [sp, #136]
+; CHECK-NEXT: add.w r3, r1, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: ldrsh.w r0, [r3]
-; CHECK-NEXT: add r3, r4
-; CHECK-NEXT: ldrsh r7, [r5], #2
-; CHECK-NEXT: smlabb r2, r0, r7, r2
+; CHECK-NEXT: ldrsh.w r4, [r3]
+; CHECK-NEXT: add r3, r6
+; CHECK-NEXT: ldrsh r0, [r9], #2
+; CHECK-NEXT: smlabb r10, r4, r0, r10
; CHECK-NEXT: le lr, .LBB10_14
; CHECK-NEXT: b .LBB10_7
+; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: vstrb.8 q0, [r3], #16
+; CHECK-NEXT: letp lr, .LBB10_15
+; CHECK-NEXT: b .LBB10_4
; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.15:
+; CHECK-NEXT: @ %bb.16:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index b746dc87cb8a6..ee0605c704362 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -verify-machineinstrs %s -o - | FileCheck %s
; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.
@@ -147,65 +147,74 @@ define dso_local i32 @e() #0 {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #392
-; CHECK-NEXT: sub sp, #392
-; CHECK-NEXT: movw r9, :lower16:.L_MergedGlobals
-; CHECK-NEXT: vldr s0, .LCPI1_0
-; CHECK-NEXT: movt r9, :upper16:.L_MergedGlobals
-; CHECK-NEXT: vldr s3, .LCPI1_1
-; CHECK-NEXT: mov r7, r9
-; CHECK-NEXT: mov r5, r9
-; CHECK-NEXT: ldr r0, [r7, #4]!
-; CHECK-NEXT: movw r4, :lower16:e
-; CHECK-NEXT: ldr r1, [r5, #8]!
-; CHECK-NEXT: movt r4, :upper16:e
-; CHECK-NEXT: vmov r6, s3
-; CHECK-NEXT: vdup.32 q4, r7
-; CHECK-NEXT: vmov s1, r7
-; CHECK-NEXT: vmov q1[2], q1[0], r5, r5
-; CHECK-NEXT: vmov s9, r4
-; CHECK-NEXT: vmov q1[3], q1[1], r6, r4
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vmov.f32 s8, s0
-; CHECK-NEXT: vmov q5, q4
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vstrw.32 q1, [sp, #76]
-; CHECK-NEXT: vmov q1[2], q1[0], r7, r6
-; CHECK-NEXT: mov.w r8, #4
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: vmov q1[3], q1[1], r7, r4
-; CHECK-NEXT: vmov.32 q3[0], r4
-; CHECK-NEXT: vmov.32 q5[1], r4
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov.f32 s11, s3
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #416
+; CHECK-NEXT: sub sp, #416
+; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
+; CHECK-NEXT: vldr s12, .LCPI1_0
+; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
+; CHECK-NEXT: vldr s15, .LCPI1_1
+; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: mov r4, r7
+; CHECK-NEXT: ldr r0, [r3, #4]!
+; CHECK-NEXT: movw r2, :lower16:e
+; CHECK-NEXT: ldr r6, [r4, #8]!
+; CHECK-NEXT: vmov r5, s15
+; CHECK-NEXT: vmov s13, r3
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: movt r2, :upper16:e
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
+; CHECK-NEXT: vmov s21, r2
+; CHECK-NEXT: vmov.f32 s14, s13
+; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
+; CHECK-NEXT: vmov.f32 s20, s12
+; CHECK-NEXT: vdup.32 q7, r3
+; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
+; CHECK-NEXT: vmov.f32 s22, s13
+; CHECK-NEXT: vstrw.32 q0, [sp, #100]
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
+; CHECK-NEXT: vmov q4, q7
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: vmov.32 q7[1], r2
+; CHECK-NEXT: vmov.f32 s23, s15
; CHECK-NEXT: movs r1, #64
-; CHECK-NEXT: strh.w r8, [sp, #390]
-; CHECK-NEXT: strd r0, r10, [sp, #24]
-; CHECK-NEXT: vstrw.32 q0, [sp, #44]
-; CHECK-NEXT: str r0, [r0]
-; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: str r0, [sp, #48]
; CHECK-NEXT: vstrw.32 q5, [r0]
-; CHECK-NEXT: vstrw.32 q3, [r0]
-; CHECK-NEXT: vstrw.32 q1, [r0]
-; CHECK-NEXT: bl __aeabi_memclr4
-; CHECK-NEXT: vmov q0[2], q0[0], r5, r7
-; CHECK-NEXT: vmov q1[2], q1[0], r7, r7
-; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT: vmov q1[3], q1[1], r5, r6
-; CHECK-NEXT: vmov.32 q4[0], r10
+; CHECK-NEXT: str r6, [r0]
+; CHECK-NEXT: vstrw.32 q7, [r0]
+; CHECK-NEXT: str r0, [r0]
; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: str.w r10, [r9]
-; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: vstrw.32 q6, [r0]
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r3
+; CHECK-NEXT: mov.w r12, #4
+; CHECK-NEXT: vmov q1[3], q1[1], r2, r4
+; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
+; CHECK-NEXT: vmov.32 q4[0], r8
+; CHECK-NEXT: @ implicit-def: $r2
+; CHECK-NEXT: str.w r8, [sp, #52]
+; CHECK-NEXT: strh.w r12, [sp, #414]
+; CHECK-NEXT: vstrw.32 q3, [sp, #68]
+; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
+; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q0, [r2], #16
+; CHECK-NEXT: letp lr, .LBB1_1
+; CHECK-NEXT: .LBB1_2: @ %entry
; CHECK-NEXT: vstrw.32 q1, [r0]
-; CHECK-NEXT: str.w r8, [sp, #308]
-; CHECK-NEXT: .LBB1_1: @ %for.cond
+; CHECK-NEXT: str.w r8, [r7]
+; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: str.w r12, [sp, #332]
+; CHECK-NEXT: .LBB1_3: @ %for.cond
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: b .LBB1_1
+; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.2:
+; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 0x00000004 @ float 5.60519386E-45
; CHECK-NEXT: .LCPI1_1:
diff --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
index a87fff9aa92aa..6eb1338eaf25d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
@@ -8,6 +8,7 @@
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
; CHECK-LABEL: test1:
@@ -281,5 +282,132 @@ for.cond.cleanup: ; preds = %entry
ret void
}
+; Check that a WLSTP loop is generated for the simplest case of align = 1
+define void @test12(i8* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test12:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vdup.8 q0, r1
+; CHECK-NEXT: wlstp.8 lr, r2, .LBB11_2
+; CHECK-NEXT: .LBB11_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-NEXT: letp lr, .LBB11_1
+; CHECK-NEXT: .LBB11_2: @ %entry
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
+ ret void
+}
+
+
+; Check that a WLSTP loop is generated for alignment >= 4
+define void @test13(i32* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test13:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vdup.8 q0, r1
+; CHECK-NEXT: wlstp.8 lr, r2, .LBB12_2
+; CHECK-NEXT: .LBB12_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-NEXT: letp lr, .LBB12_1
+; CHECK-NEXT: .LBB12_2: @ %entry
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %0 = bitcast i32* %X to i8*
+ call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
+ ret void
+}
+
+
+; Checks that the transform correctly handles some arithmetic on the input arguments.
+; void test14(int* X, char c, int n)
+; {
+; memset(X+2, c, (n*2)+10);
+; }
+
+define void @test14(i32* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test14:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #10
+; CHECK-NEXT: add.w r2, r3, r2, lsl #1
+; CHECK-NEXT: vdup.8 q0, r1
+; CHECK-NEXT: adds r0, #8
+; CHECK-NEXT: wlstp.8 lr, r2, .LBB13_2
+; CHECK-NEXT: .LBB13_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-NEXT: letp lr, .LBB13_1
+; CHECK-NEXT: .LBB13_2: @ %entry
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %add.ptr = getelementptr inbounds i32, i32* %X, i32 2
+ %0 = bitcast i32* %add.ptr to i8*
+ %mul = shl nsw i32 %n, 1
+ %add = add nsw i32 %mul, 10
+ call void @llvm.memset.p0i8.i32(i8* nonnull align 4 %0, i8 %c, i32 %add, i1 false)
+ ret void
+}
+
+
+
+
+; Checks that the transform handles for-loops that get implicitly converted to memset.
+; void test15(int* X, char c, int n){
+; for(int i = 0; i < n; ++i){
+; X[i] = c;
+; }
+; }
+
+define void @test15(i8* nocapture %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test15:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB14_1: @ %for.body.preheader
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vdup.8 q0, r1
+; CHECK-NEXT: wlstp.8 lr, r2, .LBB14_3
+; CHECK-NEXT: .LBB14_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-NEXT: letp lr, .LBB14_2
+; CHECK-NEXT: .LBB14_3: @ %for.body.preheader
+; CHECK-NEXT: pop.w {r7, lr}
+; CHECK-NEXT: bx lr
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ call void @llvm.memset.p0i8.i32(i8* align 4 %X, i8 %c, i32 %n, i1 false)
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body.preheader, %entry
+ ret void
+}
+
+; Checks that the transform handles the case with 0 as the src value. No difference is expected.
+define void @test16(i32* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: wlstp.8 lr, r2, .LBB15_2
+; CHECK-NEXT: .LBB15_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-NEXT: letp lr, .LBB15_1
+; CHECK-NEXT: .LBB15_2: @ %entry
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %0 = bitcast i32* %X to i8*
+ call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 %n, i1 false)
+ ret void
+}
+
attributes #0 = { noinline optnone }
attributes #1 = { optsize }
diff --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
index 53404c08511ea..e652846197ab5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
@@ -6,6 +6,8 @@
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
+ ; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+ declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
entry:
@@ -30,6 +32,27 @@
ret void
}
+ define void @test3(i32* nocapture %X, i8 zeroext %c, i32 %n) {
+ entry:
+ %0 = bitcast i32* %X to i8*
+ tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
+ ret void
+ }
+
+
+ define void @test4(i8* nocapture %X, i8 zeroext %c, i32 %n) {
+ entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
+ br label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %for.body.preheader, %entry
+ ret void
+ }
+
...
---
name: test1
@@ -56,7 +79,7 @@ body: |
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
- ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
+ ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
@@ -97,7 +120,7 @@ body: |
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
- ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
+ ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
@@ -125,3 +148,92 @@ body: |
tBX_RET 14 /* CC::al */, $noreg
...
+---
+name: test3
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $r0, $r1, $r2
+
+ ; CHECK-LABEL: name: test3
+ ; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
+ ; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1
+ ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
+ ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
+ ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
+ ; CHECK: .1:
+ ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %8, %bb.1
+ ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %10, %bb.1
+ ; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %12, %bb.1
+ ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
+ ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
+ ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1
+ ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
+ ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
+ ; CHECK: .2.entry:
+ ; CHECK: tBX_RET 14 /* CC::al */, $noreg
+ %2:rgpr = COPY $r2
+ %1:mqpr = COPY $r1
+ %0:rgpr = COPY $r0
+ MVE_MEMSETLOOPINST %0, %1, %2
+ tBX_RET 14 /* CC::al */, $noreg
+
+...
+---
+name: test4
+alignment: 2
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test4
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ ; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
+ ; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1
+ ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
+ ; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+ ; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
+ ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
+ ; CHECK: bb.1.for.body.preheader:
+ ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
+ ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
+ ; CHECK: bb.3:
+ ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %8, %bb.3
+ ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %10, %bb.3
+ ; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %12, %bb.3
+ ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
+ ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
+ ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1
+ ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
+ ; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
+ ; CHECK: bb.4.for.body.preheader:
+ ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tBX_RET 14 /* CC::al */, $noreg
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ liveins: $r0, $r1, $r2
+
+ %2:rgpr = COPY $r2
+ %1:mqpr = COPY $r1
+ %0:rgpr = COPY $r0
+ t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+ t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
+ t2B %bb.1, 14 /* CC::al */, $noreg
+
+ bb.1.for.body.preheader:
+ MVE_MEMSETLOOPINST %0, %1, %2
+
+ bb.2.for.cond.cleanup:
+ tBX_RET 14 /* CC::al */, $noreg
+
+...