[llvm] r246807 - [PowerPC] Include the permutation cost for unaligned vector loads
Hal Finkel via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 3 14:23:18 PDT 2015
Author: hfinkel
Date: Thu Sep 3 16:23:18 2015
New Revision: 246807
URL: http://llvm.org/viewvc/llvm-project?rev=246807&view=rev
Log:
[PowerPC] Include the permutation cost for unaligned vector loads
Pre-P8, when we generate code for unaligned vector loads (for Altivec and QPX
types), even when accounting for the combining that takes place for multiple
consecutive such loads, there is at least one load instruction and one
permutation for each load. Make sure the cost reported reflects the cost of the
permutes as well.
Modified:
llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll
llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
Modified: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp?rev=246807&r1=246806&r2=246807&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp Thu Sep 3 16:23:18 2015
@@ -333,6 +333,18 @@ int PPCTTIImpl::getMemoryOpCost(unsigned
bool IsQPXType = ST->hasQPX() &&
(LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+ // If we can use the permutation-based load sequence, then this is also
+ // relatively cheap (not counting loop-invariant instructions): one load plus
+ // one permute (the last load in a series has extra cost, but we're
+ // neglecting that here). Note that on the P7, we should do unaligned loads
+ // for Altivec types using the VSX instructions, but that's more expensive
+ // than using the permutation-based load sequence. On the P8, that's no
+ // longer true.
+ if (Opcode == Instruction::Load &&
+ ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+ Alignment >= LT.second.getScalarType().getStoreSize())
+ return Cost + LT.first; // Add the cost of the permutations.
+
// For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
// P7, unaligned vector loads are more expensive than the permutation-based
// load sequence, so that might be used instead, but regardless, the net cost
@@ -340,14 +352,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned
if (IsVSXType || (ST->hasVSX() && IsAltivecType))
return Cost;
- // If we can use the permutation-based load sequence, then this is also
- // relatively cheap (not counting loop-invariant instructions).
- bool PermutationLoad = Opcode == Instruction::Load &&
- (IsAltivecType || IsQPXType) &&
- Alignment >= LT.second.getScalarType().getStoreSize();
- if (PermutationLoad)
- return Cost;
-
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
Modified: llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll?rev=246807&r1=246806&r2=246807&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/PowerPC/load_store.ll Thu Sep 3 16:23:18 2015
@@ -34,7 +34,7 @@ define i32 @loads(i32 %arg) {
; CHECK: cost of 48 {{.*}} load
load <4 x i16>, <4 x i16>* undef, align 2
- ; CHECK: cost of 1 {{.*}} load
+ ; CHECK: cost of 2 {{.*}} load
load <4 x i32>, <4 x i32>* undef, align 4
; CHECK: cost of 46 {{.*}} load
Modified: llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll?rev=246807&r1=246806&r2=246807&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll Thu Sep 3 16:23:18 2015
@@ -8,7 +8,7 @@ entry:
ret <16 x i8> %r
; CHECK-LABEL: test_l_v16i8
-; CHECK: cost of 1 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
+; CHECK: cost of 2 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
}
define <32 x i8> @test_l_v32i8(<32 x i8>* %p) #0 {
@@ -17,7 +17,7 @@ entry:
ret <32 x i8> %r
; CHECK-LABEL: test_l_v32i8
-; CHECK: cost of 2 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
+; CHECK: cost of 4 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
}
define <8 x i16> @test_l_v8i16(<8 x i16>* %p) #0 {
@@ -26,7 +26,7 @@ entry:
ret <8 x i16> %r
; CHECK-LABEL: test_l_v8i16
-; CHECK: cost of 1 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
+; CHECK: cost of 2 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
}
define <16 x i16> @test_l_v16i16(<16 x i16>* %p) #0 {
@@ -35,7 +35,7 @@ entry:
ret <16 x i16> %r
; CHECK-LABEL: test_l_v16i16
-; CHECK: cost of 2 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
+; CHECK: cost of 4 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
}
define <4 x i32> @test_l_v4i32(<4 x i32>* %p) #0 {
@@ -44,7 +44,7 @@ entry:
ret <4 x i32> %r
; CHECK-LABEL: test_l_v4i32
-; CHECK: cost of 1 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
}
define <8 x i32> @test_l_v8i32(<8 x i32>* %p) #0 {
@@ -53,7 +53,7 @@ entry:
ret <8 x i32> %r
; CHECK-LABEL: test_l_v8i32
-; CHECK: cost of 2 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
}
define <2 x i64> @test_l_v2i64(<2 x i64>* %p) #0 {
@@ -80,7 +80,7 @@ entry:
ret <4 x float> %r
; CHECK-LABEL: test_l_v4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
}
define <8 x float> @test_l_v8float(<8 x float>* %p) #0 {
@@ -89,7 +89,7 @@ entry:
ret <8 x float> %r
; CHECK-LABEL: test_l_v8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
}
define <2 x double> @test_l_v2double(<2 x double>* %p) #0 {
@@ -224,7 +224,7 @@ entry:
ret <4 x float> %r
; CHECK-LABEL: test_l_qv4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
}
define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
@@ -233,7 +233,7 @@ entry:
ret <8 x float> %r
; CHECK-LABEL: test_l_qv8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
}
define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
@@ -242,7 +242,7 @@ entry:
ret <4 x double> %r
; CHECK-LABEL: test_l_qv4double
-; CHECK: cost of 1 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
+; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
}
define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
@@ -251,7 +251,7 @@ entry:
ret <8 x double> %r
; CHECK-LABEL: test_l_qv8double
-; CHECK: cost of 2 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
+; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
}
define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
More information about the llvm-commits
mailing list