[llvm] 6c0a1ed - [ARM] Add FP handling for MVE lane interleaving
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 12 07:28:26 PDT 2021
Author: David Green
Date: 2021-04-12T15:28:13+01:00
New Revision: 6c0a1ed3a94ff34e6d9500cdfd04858b1a6f72aa
URL: https://github.com/llvm/llvm-project/commit/6c0a1ed3a94ff34e6d9500cdfd04858b1a6f72aa
DIFF: https://github.com/llvm/llvm-project/commit/6c0a1ed3a94ff34e6d9500cdfd04858b1a6f72aa.diff
LOG: [ARM] Add FP handling for MVE lane interleaving
FP16 to FP32 converts can be handled in MVE lane interleaving, much like
the sext/zext lowering we do. This expands the pass with fpext and
fptrunc handling, plus basic fp operations (fcmp, fadd and fmul), allowing
more efficient lowering of fp vectors.
Differential Revision: https://reviews.llvm.org/D97292
Added:
Modified:
llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
index 9189298a9d04..94dcfd112258 100644
--- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
+++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -123,17 +123,20 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
// T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
// But those VMOVL may be folded into a VMULL.
- // But expensive extends/truncs are always good to remove.
- for (auto *E : Exts)
- if (!isa<LoadInst>(E->getOperand(0))) {
+ // But expensive extends/truncs are always good to remove. FPExts always
+ // involve extra VCVT's so are always considered to be beneficial to convert.
+ for (auto *E : Exts) {
+ if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
return true;
}
- for (auto *T : Truncs)
+ }
+ for (auto *T : Truncs) {
if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
return true;
}
+ }
// Otherwise, we know we have a load(ext), see if any of the Extends are a
// vmull. This is a simple heuristic and certainly not perfect.
@@ -172,6 +175,7 @@ static bool tryInterleave(Instruction *Start,
switch (I->getOpcode()) {
// Truncs
case Instruction::Trunc:
+ case Instruction::FPTrunc:
if (Truncs.count(I))
continue;
Truncs.insert(I);
@@ -181,6 +185,7 @@ static bool tryInterleave(Instruction *Start,
// Extend leafs
case Instruction::SExt:
case Instruction::ZExt:
+ case Instruction::FPExt:
if (Exts.count(I))
continue;
for (auto *Use : I->users())
@@ -196,6 +201,9 @@ static bool tryInterleave(Instruction *Start,
case Instruction::LShr:
case Instruction::Shl:
case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::FAdd:
+ case Instruction::FMul:
case Instruction::Select:
if (Ops.count(I))
continue;
@@ -297,9 +305,11 @@ static bool tryInterleave(Instruction *Start,
LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
Builder.SetInsertPoint(I);
Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
+ bool FPext = isa<FPExtInst>(I);
bool Sext = isa<SExtInst>(I);
- Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType())
- : Builder.CreateZExt(Shuffle, I->getType());
+ Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
+ : Sext ? Builder.CreateSExt(Shuffle, I->getType())
+ : Builder.CreateZExt(Shuffle, I->getType());
I->replaceAllUsesWith(Ext);
LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
index 8af3ecaa8a8f..c6e4dd5867b2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
@@ -360,16 +360,14 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
-; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vstrh.32 q1, [r1, #8]
-; CHECK-NEXT: vldrh.u32 q1, [r0], #16
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
+; CHECK-NEXT: vldrh.u16 q1, [r0], #16
+; CHECK-NEXT: vcvtb.f32.f16 q2, q1
+; CHECK-NEXT: vcvtt.f32.f16 q1, q1
+; CHECK-NEXT: vmul.f32 q2, q2, q0
; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vstrh.32 q1, [r1], #16
+; CHECK-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-NEXT: vcvtt.f16.f32 q2, q1
+; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: le lr, .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -412,26 +410,22 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u32 q1, [r0, #24]
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
-; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vstrh.32 q1, [r1, #24]
-; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
-; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vstrh.32 q1, [r1, #16]
-; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
+; CHECK-NEXT: vldrh.u16 q1, [r0, #16]
+; CHECK-NEXT: vcvtb.f32.f16 q2, q1
+; CHECK-NEXT: vcvtt.f32.f16 q1, q1
+; CHECK-NEXT: vmul.f32 q2, q2, q0
; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vstrh.32 q1, [r1, #8]
-; CHECK-NEXT: vldrh.u32 q1, [r0], #32
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
+; CHECK-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-NEXT: vcvtt.f16.f32 q2, q1
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
+; CHECK-NEXT: vstrh.16 q2, [r1, #16]
+; CHECK-NEXT: vcvtb.f32.f16 q2, q1
+; CHECK-NEXT: vcvtt.f32.f16 q1, q1
+; CHECK-NEXT: vmul.f32 q2, q2, q0
; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vstrh.32 q1, [r1], #32
+; CHECK-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-NEXT: vcvtt.f16.f32 q2, q1
+; CHECK-NEXT: vstrh.16 q2, [r1], #32
; CHECK-NEXT: le lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
More information about the llvm-commits
mailing list