[llvm] 146dad0 - [ARM] MVE FP16 cost adjustments
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 6 07:58:38 PDT 2020
Author: David Green
Date: 2020-07-06T15:57:51+01:00
New Revision: 146dad0077b46a0fb8e158c10490c1774db5a762
URL: https://github.com/llvm/llvm-project/commit/146dad0077b46a0fb8e158c10490c1774db5a762
DIFF: https://github.com/llvm/llvm-project/commit/146dad0077b46a0fb8e158c10490c1774db5a762.diff
LOG: [ARM] MVE FP16 cost adjustments
This adjusts the MVE fp16 cost model, similar to how we already do for
integer casts. It uses the base cost of 1 per cvt for most fp extend /
truncates, but adjusts it for loads and stores where we know that a
extending load has been used to get the load into the correct lane, and
only an MVE VCVTB is then needed.
Differential Revision: https://reviews.llvm.org/D81813
Added:
Modified:
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 04a259657321..44dfb9e8c129 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -229,6 +229,18 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
+
+ static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ // FPExtends are similar but also require the VCVT instructions.
+ {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
+ {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
+ };
+ if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
}
// The truncate of a store is free. This is the mirror of extends above.
@@ -247,6 +259,17 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
+
+ static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
+ {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
+ };
+ if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(),
+ DstTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
}
// NEON vector operations that can extend their inputs.
@@ -955,6 +978,22 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first * 4;
}
+ // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
+ // Same for stores.
+ if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
+ ((Opcode == Instruction::Load && I->hasOneUse() &&
+ isa<FPExtInst>(*I->user_begin())) ||
+ (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
+ FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
+ Type *DstTy =
+ Opcode == Instruction::Load
+ ? (*I->user_begin())->getType()
+ : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
+ if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
+ DstTy->getScalarType()->isFloatTy())
+ return ST->getMVEVectorCostFactor();
+ }
+
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
index 7628f09fc646..491e0900e08a 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@@ -1284,8 +1284,8 @@ define i32 @load_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r2 = fpext half %loadf16 to double
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r3 = fpext float %loadf32 to double
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
@@ -1294,8 +1294,8 @@ define i32 @load_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v9 = fpext <2 x float> %loadv2f32 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-RECIP-LABEL: 'load_fpextends'
@@ -1396,8 +1396,8 @@ define i32 @load_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r2 = fpext half %loadf16 to double
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r3 = fpext float %loadf32 to double
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
@@ -1407,7 +1407,7 @@ define i32 @load_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-SIZE-LABEL: 'load_fpextends'
@@ -1558,9 +1558,9 @@ define i32 @load_fptrunc() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %i3264 = fptrunc double undef to float
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
@@ -1569,7 +1569,7 @@ define i32 @load_fptrunc() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %i3264, float* undef, align 4
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21632, <2 x half>* undef, align 4
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21664, <2 x half>* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41664, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81632, <8 x half>* undef, align 16
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81664, <8 x half>* undef, align 16
@@ -1658,9 +1658,9 @@ define i32 @load_fptrunc() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %i3264 = fptrunc double undef to float
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
@@ -2784,8 +2784,8 @@ define i32 @maskedload_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v9 = fpext <2 x float> %loadv2f32 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-RECIP-LABEL: 'maskedload_fpextends'
@@ -2877,7 +2877,7 @@ define i32 @maskedload_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-SIZE-LABEL: 'maskedload_fpextends'
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
index 747bac801f61..ff3e03c7bad4 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -562,14 +562,12 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
-; so reject this for now.
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fpext_allowed(
-; PREFER-FOLDING-NOT: vector.body:
+; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
-; PREFER-FOLDING-NOT: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
br label %for.body
@@ -591,14 +589,12 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
-; so reject this for now.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fptrunc_allowed(
-; PREFER-FOLDING-NOT: vector.body:
+; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
-; PREFER-FOLDING-NOT: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
br label %for.body
More information about the llvm-commits
mailing list