[llvm] 87c5659 - [ARM] Sink splats to fma intrinsics
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed May 13 06:59:57 PDT 2020
Author: David Green
Date: 2020-05-13T14:58:30+01:00
New Revision: 87c56594dd98bc4fe6932713325f410a1b68e962
URL: https://github.com/llvm/llvm-project/commit/87c56594dd98bc4fe6932713325f410a1b68e962
DIFF: https://github.com/llvm/llvm-project/commit/87c56594dd98bc4fe6932713325f410a1b68e962.diff
LOG: [ARM] Sink splats to fma intrinsics
Similar to fmul/fadd, we can sink a splat into a loop containing an fma
in order to use more register instruction variants. This also requires
adjustments to the sinking code to handle more than 2 arguments.
Differential Revision: https://reviews.llvm.org/D78386
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
llvm/test/CodeGen/Thumb2/mve-vldst4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index eb2fb1c9c5d7..e8e6b6f900c7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15707,6 +15707,12 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
auto *Sub = cast<Instruction>(*I->users().begin());
return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
};
+ auto IsFMS = [&](Instruction *I) {
+ if (match(I->getOperand(0), m_FNeg(m_Value())) ||
+ match(I->getOperand(1), m_FNeg(m_Value())))
+ return true;
+ return false;
+ };
auto IsSinker = [&](Instruction *I, int Operand) {
switch (I->getOpcode()) {
@@ -15724,31 +15730,45 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
case Instruction::LShr:
case Instruction::AShr:
return Operand == 1;
+ case Instruction::Call:
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::fma:
+ return !IsFMS(I);
+ default:
+ return false;
+ }
+ }
+ return false;
default:
return false;
}
};
- int Op = 0;
- if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
- Op = 1;
- if (!IsSinker(I, Op))
- return false;
- if (!match(I->getOperand(Op),
- m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
- m_Undef(), m_ZeroMask()))) {
- return false;
- }
- Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
- // All uses of the shuffle should be sunk to avoid duplicating it across gpr
- // and vector registers
- for (Use &U : Shuffle->uses()) {
- Instruction *Insn = cast<Instruction>(U.getUser());
- if (!IsSinker(Insn, U.getOperandNo()))
- return false;
+ for (auto OpIdx : enumerate(I->operands())) {
+ Value *Op = OpIdx.value().get();
+ // Make sure we are not already sinking this operand
+ if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+ continue;
+ // We are looking for a splat that can be sunk.
+ if (!match(Op, m_ShuffleVector(
+ m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
+ continue;
+ if (!IsSinker(I, OpIdx.index()))
+ continue;
+
+ Instruction *Shuffle = cast<Instruction>(Op);
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Shuffle->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ Ops.push_back(&OpIdx.value());
}
- Ops.push_back(&Shuffle->getOperandUse(0));
- Ops.push_back(&I->getOperandUse(Op));
return true;
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
index de200eb09779..4888f78c7dae 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -10,15 +10,13 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -127,14 +125,13 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vfma.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vfma.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -242,17 +239,15 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vneg.f32 s0, s0
; CHECK-NEXT: dlstp.32 lr, r3
-; CHECK-NEXT: eor r12, r12, #-2147483648
-; CHECK-NEXT: vdup.32 q0, r12
+; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -484,16 +479,15 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vneg.f32 s0, s0
; CHECK-NEXT: dlstp.32 lr, r3
-; CHECK-NEXT: eor r12, r12, #-2147483648
-; CHECK-NEXT: vdup.32 q0, r12
+; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vfma.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vfma.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -604,15 +598,14 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB10_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vneg.f32 q1, q1
-; CHECK-NEXT: vfma.f32 q1, q2, q0
-; CHECK-NEXT: vstrw.32 q1, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vneg.f32 q0, q0
+; CHECK-NEXT: vfma.f32 q0, q1, r12
+; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index 5c4d982d66ad..8fba6d97eaa6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -4,235 +4,233 @@
define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
; CHECK-LABEL: vldst4:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #104
-; CHECK-NEXT: sub sp, #104
+; CHECK-NEXT: .pad #88
+; CHECK-NEXT: sub sp, #88
; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: cmp.w r3, r2, lsr #2
; CHECK-NEXT: beq.w .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
+; CHECK-NEXT: vldr.16 s0, [sp, #160]
; CHECK-NEXT: mvn r3, #7
; CHECK-NEXT: and.w r2, r3, r2, lsr #2
-; CHECK-NEXT: vldr.16 s0, [sp, #176]
-; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: vmov.f16 r12, s0
+; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: add.w lr, r3, r2, lsr #3
-; CHECK-NEXT: vmov.f16 r2, s0
-; CHECK-NEXT: vdup.16 q0, r2
-; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q6, [r0, #32]
-; CHECK-NEXT: vldrh.u16 q4, [r0, #48]
-; CHECK-NEXT: vldrh.u16 q0, [r0], #64
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmovx.f16 s12, s16
-; CHECK-NEXT: vmov.16 q1[4], r3
-; CHECK-NEXT: vmov r2, s26
-; CHECK-NEXT: vmov.16 q1[5], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vldrh.u16 q7, [r0, #-48]
-; CHECK-NEXT: vmov.16 q2[0], r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov.16 q2[1], r3
-; CHECK-NEXT: vmov r2, s28
-; CHECK-NEXT: vmov.16 q2[2], r2
-; CHECK-NEXT: vmov r2, s18
-; CHECK-NEXT: vmov.16 q1[7], r2
-; CHECK-NEXT: vmov r2, s30
-; CHECK-NEXT: vmov.16 q2[3], r2
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmul.f16 q5, q2, q1
-; CHECK-NEXT: vmovx.f16 s4, s2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s8, s28
-; CHECK-NEXT: vmov.16 q1[0], r2
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov.16 q1[1], r3
-; CHECK-NEXT: vmovx.f16 s8, s26
+; CHECK-NEXT: vldrh.u16 q4, [r0, #32]
+; CHECK-NEXT: vldrh.u16 q3, [r0, #48]
+; CHECK-NEXT: vldrh.u16 q7, [r0], #64
+; CHECK-NEXT: vmov r2, s17
+; CHECK-NEXT: vmovx.f16 s8, s13
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov r3, s19
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: vmov r2, s13
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov r2, s15
+; CHECK-NEXT: vmov r3, s29
+; CHECK-NEXT: vldrh.u16 q5, [r0, #-48]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vmov r2, s31
+; CHECK-NEXT: vmov.16 q1[0], r3
+; CHECK-NEXT: vmov.16 q1[1], r2
+; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmovx.f16 s8, s24
-; CHECK-NEXT: vmov r12, s23
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmov.16 q2[4], r3
-; CHECK-NEXT: vmov r3, s27
-; CHECK-NEXT: vmov.16 q2[5], r2
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vmov.16 q2[6], r2
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmovx.f16 s12, s30
-; CHECK-NEXT: vmov.16 q2[7], r2
-; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmov r2, s23
; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s7, s11
-; CHECK-NEXT: vmov.16 q2[0], r12
-; CHECK-NEXT: vmul.f16 q1, q1, q3
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s4, s23
-; CHECK-NEXT: vmov.16 q2[1], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q2[4], r2
-; CHECK-NEXT: vmov r2, s25
-; CHECK-NEXT: vmov.16 q1[4], r2
-; CHECK-NEXT: vmov r2, s17
-; CHECK-NEXT: vmov.16 q1[5], r3
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmul.f16 q0, q1, r12
+; CHECK-NEXT: vmovx.f16 s4, s21
+; CHECK-NEXT: vmov q6, q0
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s31
; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov.16 q2[0], r3
-; CHECK-NEXT: vmov.16 q2[1], r2
-; CHECK-NEXT: vmov r2, s29
-; CHECK-NEXT: vmov.16 q2[2], r2
-; CHECK-NEXT: vmov r2, s19
-; CHECK-NEXT: vmov.16 q1[7], r2
-; CHECK-NEXT: vmov r2, s31
-; CHECK-NEXT: vmov.16 q2[3], r2
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vmovx.f16 s4, s3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmovx.f16 s4, s29
-; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: vmovx.f16 s0, s29
+; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: vmov.16 q0[1], r2
; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s4, s27
+; CHECK-NEXT: vmovx.f16 s4, s19
; CHECK-NEXT: vmov.16 q0[2], r2
; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s4, s25
-; CHECK-NEXT: vmul.f16 q5, q2, q3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmovx.f16 s8, s17
-; CHECK-NEXT: vmov.16 q1[4], r3
-; CHECK-NEXT: vmov r12, s20
+; CHECK-NEXT: vmovx.f16 s4, s17
+; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vmov.16 q1[4], r4
; CHECK-NEXT: vmov.16 q1[5], r2
; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmovx.f16 s8, s19
+; CHECK-NEXT: vmovx.f16 s8, s15
; CHECK-NEXT: vmov.16 q1[6], r2
; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmovx.f16 s8, s31
+; CHECK-NEXT: vmovx.f16 s8, s23
; CHECK-NEXT: vmov.16 q1[7], r2
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov.16 q0[3], r2
-; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s2, s6
; CHECK-NEXT: vmov.f32 s3, s7
-; CHECK-NEXT: vmov.16 q1[2], r12
-; CHECK-NEXT: vmul.f16 q4, q0, q3
-; CHECK-NEXT: vmovx.f16 s0, s20
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmovx.f16 s20, s23
+; CHECK-NEXT: vmov.16 q1[2], r3
+; CHECK-NEXT: vmul.f16 q0, q0, r12
+; CHECK-NEXT: vmov r3, s18
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s25
; CHECK-NEXT: vmov.16 q1[3], r2
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s16
+; CHECK-NEXT: vmovx.f16 s0, s9
; CHECK-NEXT: vmov.16 q1[6], r2
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s16, s19
+; CHECK-NEXT: vmovx.f16 s8, s12
; CHECK-NEXT: vmov.16 q1[7], r2
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: vmov r3, s28
; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov r2, s30
+; CHECK-NEXT: vmov.16 q1[0], r3
+; CHECK-NEXT: vmov.16 q1[1], r2
+; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: vmov.16 q1[2], r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vmov r2, s22
+; CHECK-NEXT: vmov.16 q1[3], r2
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmovx.f16 s0, s30
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s28
+; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: vmul.f16 q6, q1, r12
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: vmovx.f16 s4, s20
+; CHECK-NEXT: vmov.16 q0[1], r2
; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov.16 q6[0], r2
+; CHECK-NEXT: vmovx.f16 s4, s18
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: vmovx.f16 s4, s16
+; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q1[4], r4
+; CHECK-NEXT: vmov r3, s25
+; CHECK-NEXT: vmov.16 q1[5], r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmovx.f16 s8, s14
+; CHECK-NEXT: vmov.16 q1[6], r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmovx.f16 s8, s22
+; CHECK-NEXT: vmov.16 q1[7], r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: vmov.16 q5[0], r3
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmul.f16 q2, q0, r12
+; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmov r2, s9
+; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q5[1], r2
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.16 q6[1], r3
-; CHECK-NEXT: vmovx.f16 s0, s8
-; CHECK-NEXT: vmov.16 q6[4], r2
+; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.16 q6[5], r2
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: vmov r3, s17
+; CHECK-NEXT: vmovx.f16 s0, s19
+; CHECK-NEXT: vmov.16 q5[5], r2
+; CHECK-NEXT: vmov r2, s19
; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmovx.f16 s0, s21
+; CHECK-NEXT: vmov r3, s7
; CHECK-NEXT: vmov.16 q3[3], r3
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s17
+; CHECK-NEXT: vmovx.f16 s0, s7
; CHECK-NEXT: vmov.16 q3[6], r2
; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s27
; CHECK-NEXT: vmov.16 q3[7], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov r3, s9
+; CHECK-NEXT: vmov r2, s27
; CHECK-NEXT: vmov.16 q7[0], r2
-; CHECK-NEXT: vmovx.f16 s0, s5
+; CHECK-NEXT: vmov r3, s11
; CHECK-NEXT: vmov.16 q7[1], r3
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vmovx.f16 s0, s11
; CHECK-NEXT: vmov.16 q7[4], r2
; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov.16 q7[5], r2
-; CHECK-NEXT: vmov r2, s22
-; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov r3, s18
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmovx.f16 s0, s16
; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmovx.f16 s0, s22
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmovx.f16 s16, s18
; CHECK-NEXT: vmov.16 q3[3], r3
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s18
+; CHECK-NEXT: vmovx.f16 s0, s4
; CHECK-NEXT: vmov.16 q3[6], r2
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s4, s6
+; CHECK-NEXT: vmovx.f16 s0, s24
; CHECK-NEXT: vmov.16 q3[7], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: vmov.16 q1[0], r2
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov.16 q1[1], r3
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s8
+; CHECK-NEXT: vmov.16 q1[4], r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s24, s26
+; CHECK-NEXT: vmov.16 q1[5], r2
+; CHECK-NEXT: vmov r2, s26
; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.16 q0[1], r3
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s4, s10
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q0[5], r2
-; CHECK-NEXT: vmov r2, s23
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov r3, s19
-; CHECK-NEXT: vmov r2, s20
-; CHECK-NEXT: vmov.16 q1[3], r3
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmovx.f16 s16, s11
-; CHECK-NEXT: vmov.16 q1[7], r2
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: vmov.16 q6[2], r2
+; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmovx.f16 s16, s10
; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q4[5], r2
-; CHECK-NEXT: vmov.f32 s1, s13
-; CHECK-NEXT: vmov.f32 s29, s9
-; CHECK-NEXT: vmov.f32 s31, s11
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmov.f32 s25, s21
-; CHECK-NEXT: vmov.f32 s9, s5
-; CHECK-NEXT: vmov.f32 s3, s15
-; CHECK-NEXT: vmov.f32 s11, s7
+; CHECK-NEXT: vmov.16 q6[3], r3
+; CHECK-NEXT: vmov.16 q6[6], r2
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmovx.f16 s16, s10
+; CHECK-NEXT: vmov.16 q6[7], r2
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vmov.f32 s1, s25
+; CHECK-NEXT: vmov.f32 s3, s27
; CHECK-NEXT: vstrh.16 q0, [r1, #32]
-; CHECK-NEXT: vmov.f32 s27, s23
-; CHECK-NEXT: vstrh.16 q2, [r1, #48]
-; CHECK-NEXT: vstrh.16 q6, [r1], #64
-; CHECK-NEXT: vstrh.16 q7, [r1, #-48]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s7, s15
+; CHECK-NEXT: vmov.f32 s29, s1
+; CHECK-NEXT: vmov.f32 s31, s3
+; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vstrh.16 q7, [r1, #48]
+; CHECK-NEXT: vstrh.16 q1, [r1], #64
+; CHECK-NEXT: vmov.f32 s21, s1
+; CHECK-NEXT: vmov.f32 s23, s3
+; CHECK-NEXT: vstrh.16 q5, [r1, #-48]
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: .LBB0_3: @ %while.end
-; CHECK-NEXT: add sp, #104
+; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
%l0 = bitcast i16 %tmp.0.extract.trunc to half
More information about the llvm-commits
mailing list