[llvm] r246807 - [PowerPC] Include the permutation cost for unaligned vector loads
Hal Finkel via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 3 14:23:18 PDT 2015
Author: hfinkel
Date: Thu Sep 3 16:23:18 2015
New Revision: 246807
URL: http://llvm.org/viewvc/llvm-project?rev=246807&view=rev
Log:
[PowerPC] Include the permutation cost for unaligned vector loads
Pre-P8, when we generate code for unaligned vector loads (for Altivec and QPX
types), even when accounting for the combining that takes place for multiple
consecutive such loads, there is at least one load instruction and one
permutation for each load. Make sure the cost reported reflects the cost of the
permutes as well.
Modified:
llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll
llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
Modified: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp?rev=246807&r1=246806&r2=246807&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp Thu Sep 3 16:23:18 2015
@@ -333,6 +333,18 @@ int PPCTTIImpl::getMemoryOpCost(unsigned
bool IsQPXType = ST->hasQPX() &&
(LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+ // If we can use the permutation-based load sequence, then this is also
+ // relatively cheap (not counting loop-invariant instructions): one load plus
+ // one permute (the last load in a series has extra cost, but we're
+ // neglecting that here). Note that on the P7, we should do unaligned loads
+ // for Altivec types using the VSX instructions, but that's more expensive
+ // than using the permutation-based load sequence. On the P8, that's no
+ // longer true.
+ if (Opcode == Instruction::Load &&
+ ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+ Alignment >= LT.second.getScalarType().getStoreSize())
+ return Cost + LT.first; // Add the cost of the permutations.
+
// For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
// P7, unaligned vector loads are more expensive than the permutation-based
// load sequence, so that might be used instead, but regardless, the net cost
@@ -340,14 +352,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned
if (IsVSXType || (ST->hasVSX() && IsAltivecType))
return Cost;
- // If we can use the permutation-based load sequence, then this is also
- // relatively cheap (not counting loop-invariant instructions).
- bool PermutationLoad = Opcode == Instruction::Load &&
- (IsAltivecType || IsQPXType) &&
- Alignment >= LT.second.getScalarType().getStoreSize();
- if (PermutationLoad)
- return Cost;
-
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
Modified: llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll?rev=246807&r1=246806&r2=246807&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll Thu Sep 3 16:23:18 2015
@@ -34,7 +34,7 @@ define i32 @loads(i32 %arg) {
; CHECK: cost of 48 {{.*}} load
load <4 x i16>, <4 x i16>* undef, align 2
- ; CHECK: cost of 1 {{.*}} load
+ ; CHECK: cost of 2 {{.*}} load
load <4 x i32>, <4 x i32>* undef, align 4
; CHECK: cost of 46 {{.*}} load
Modified: llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll?rev=246807&r1=246806&r2=246807&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll Thu Sep 3 16:23:18 2015
@@ -8,7 +8,7 @@ entry:
ret <16 x i8> %r
; CHECK-LABEL: test_l_v16i8
-; CHECK: cost of 1 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
+; CHECK: cost of 2 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
}
define <32 x i8> @test_l_v32i8(<32 x i8>* %p) #0 {
@@ -17,7 +17,7 @@ entry:
ret <32 x i8> %r
; CHECK-LABEL: test_l_v32i8
-; CHECK: cost of 2 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
+; CHECK: cost of 4 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
}
define <8 x i16> @test_l_v8i16(<8 x i16>* %p) #0 {
@@ -26,7 +26,7 @@ entry:
ret <8 x i16> %r
; CHECK-LABEL: test_l_v8i16
-; CHECK: cost of 1 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
+; CHECK: cost of 2 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
}
define <16 x i16> @test_l_v16i16(<16 x i16>* %p) #0 {
@@ -35,7 +35,7 @@ entry:
ret <16 x i16> %r
; CHECK-LABEL: test_l_v16i16
-; CHECK: cost of 2 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
+; CHECK: cost of 4 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
}
define <4 x i32> @test_l_v4i32(<4 x i32>* %p) #0 {
@@ -44,7 +44,7 @@ entry:
ret <4 x i32> %r
; CHECK-LABEL: test_l_v4i32
-; CHECK: cost of 1 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
}
define <8 x i32> @test_l_v8i32(<8 x i32>* %p) #0 {
@@ -53,7 +53,7 @@ entry:
ret <8 x i32> %r
; CHECK-LABEL: test_l_v8i32
-; CHECK: cost of 2 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
}
define <2 x i64> @test_l_v2i64(<2 x i64>* %p) #0 {
@@ -80,7 +80,7 @@ entry:
ret <4 x float> %r
; CHECK-LABEL: test_l_v4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
}
define <8 x float> @test_l_v8float(<8 x float>* %p) #0 {
@@ -89,7 +89,7 @@ entry:
ret <8 x float> %r
; CHECK-LABEL: test_l_v8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
}
define <2 x double> @test_l_v2double(<2 x double>* %p) #0 {
@@ -224,7 +224,7 @@ entry:
ret <4 x float> %r
; CHECK-LABEL: test_l_qv4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
}
define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
@@ -233,7 +233,7 @@ entry:
ret <8 x float> %r
; CHECK-LABEL: test_l_qv8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
}
define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
@@ -242,7 +242,7 @@ entry:
ret <4 x double> %r
; CHECK-LABEL: test_l_qv4double
-; CHECK: cost of 1 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
+; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
}
define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
@@ -251,7 +251,7 @@ entry:
ret <8 x double> %r
; CHECK-LABEL: test_l_qv8double
-; CHECK: cost of 2 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
+; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
}
define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
More information about the llvm-commits
mailing list