[llvm] r324823 - [X86][SSE] Increase PMULLD costs to better match hardware
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 10 11:27:10 PST 2018
Author: rksimon
Date: Sat Feb 10 11:27:10 2018
New Revision: 324823
URL: http://llvm.org/viewvc/llvm-project?rev=324823&view=rev
Log:
[X86][SSE] Increase PMULLD costs to better match hardware
Until Skylake, most hardware could only issue a PMULLD op every other cycle.
Modified:
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/X86/arith.ll
llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=324823&r1=324822&r2=324823&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Sat Feb 10 11:27:10 2018
@@ -433,7 +433,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
@@ -572,7 +574,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
@@ -667,7 +669,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
Modified: llvm/trunk/test/Analysis/CostModel/X86/arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith.ll?rev=324823&r1=324822&r2=324823&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith.ll Sat Feb 10 11:27:10 2018
@@ -496,21 +496,21 @@ define i32 @mul(i32 %arg) {
; CHECK: cost of 1 {{.*}} %I32 = mul
%I32 = mul i32 undef, undef
; SSSE3: cost of 6 {{.*}} %V4I32 = mul
- ; SSE42: cost of 1 {{.*}} %V4I32 = mul
- ; AVX: cost of 1 {{.*}} %V4I32 = mul
- ; AVX2: cost of 1 {{.*}} %V4I32 = mul
+ ; SSE42: cost of 2 {{.*}} %V4I32 = mul
+ ; AVX: cost of 2 {{.*}} %V4I32 = mul
+ ; AVX2: cost of 2 {{.*}} %V4I32 = mul
; AVX512: cost of 1 {{.*}} %V4I32 = mul
%V4I32 = mul <4 x i32> undef, undef
; SSSE3: cost of 12 {{.*}} %V8I32 = mul
- ; SSE42: cost of 2 {{.*}} %V8I32 = mul
+ ; SSE42: cost of 4 {{.*}} %V8I32 = mul
; AVX: cost of 4 {{.*}} %V8I32 = mul
- ; AVX2: cost of 1 {{.*}} %V8I32 = mul
+ ; AVX2: cost of 2 {{.*}} %V8I32 = mul
; AVX512: cost of 1 {{.*}} %V8I32 = mul
%V8I32 = mul <8 x i32> undef, undef
; SSSE3: cost of 24 {{.*}} %V16I32 = mul
- ; SSE42: cost of 4 {{.*}} %V16I32 = mul
+ ; SSE42: cost of 8 {{.*}} %V16I32 = mul
; AVX: cost of 8 {{.*}} %V16I32 = mul
- ; AVX2: cost of 2 {{.*}} %V16I32 = mul
+ ; AVX2: cost of 4 {{.*}} %V16I32 = mul
; AVX512: cost of 1 {{.*}} %V16I32 = mul
%V16I32 = mul <16 x i32> undef, undef
Modified: llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll?rev=324823&r1=324822&r2=324823&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll Sat Feb 10 11:27:10 2018
@@ -398,8 +398,8 @@ define <8 x i64> @constant_shift_v8i64(<
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32':
; SSE2: Found an estimated cost of 6 for instruction: %shift
-; SSE41: Found an estimated cost of 1 for instruction: %shift
-; AVX: Found an estimated cost of 1 for instruction: %shift
+; SSE41: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 2 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 1 for instruction: %shift
@@ -411,7 +411,7 @@ define <4 x i32> @constant_shift_v4i32(<
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32':
; SSE2: Found an estimated cost of 12 for instruction: %shift
-; SSE41: Found an estimated cost of 2 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
@@ -424,7 +424,7 @@ define <8 x i32> @constant_shift_v8i32(<
define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32':
; SSE2: Found an estimated cost of 24 for instruction: %shift
-; SSE41: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
@@ -694,7 +694,7 @@ define <8 x i16> @test2(<8 x i16> %a) {
; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
-; Make sure that the estimated cost is always 1 except for the case where
+; Make sure that the estimated cost is always 2 except for the case where
; we only have SSE2 support. With SSE2, we are forced to special lower the
; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.
@@ -704,8 +704,8 @@ define <4 x i32> @test3(<4 x i32> %a) {
}
; CHECK: 'Cost Model Analysis' for function 'test3':
; SSE2: Found an estimated cost of 6 for instruction: %shl
-; SSE41: Found an estimated cost of 1 for instruction: %shl
-; AVX: Found an estimated cost of 1 for instruction: %shl
+; SSE41: Found an estimated cost of 2 for instruction: %shl
+; AVX: Found an estimated cost of 2 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
; XOP: Found an estimated cost of 1 for instruction: %shl
@@ -716,8 +716,8 @@ define <4 x i32> @test4(<4 x i32> %a) {
}
; CHECK: 'Cost Model Analysis' for function 'test4':
; SSE2: Found an estimated cost of 6 for instruction: %shl
-; SSE41: Found an estimated cost of 1 for instruction: %shl
-; AVX: Found an estimated cost of 1 for instruction: %shl
+; SSE41: Found an estimated cost of 2 for instruction: %shl
+; AVX: Found an estimated cost of 2 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
; XOP: Found an estimated cost of 1 for instruction: %shl
@@ -775,7 +775,7 @@ define <8 x i32> @test7(<8 x i32> %a) {
}
; CHECK: 'Cost Model Analysis' for function 'test7':
; SSE2: Found an estimated cost of 12 for instruction: %shl
-; SSE41: Found an estimated cost of 2 for instruction: %shl
+; SSE41: Found an estimated cost of 4 for instruction: %shl
; AVX: Found an estimated cost of 4 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
; XOPAVX: Found an estimated cost of 4 for instruction: %shl
@@ -823,7 +823,7 @@ define <16 x i32> @test10(<16 x i32> %a)
}
; CHECK: 'Cost Model Analysis' for function 'test10':
; SSE2: Found an estimated cost of 24 for instruction: %shl
-; SSE41: Found an estimated cost of 4 for instruction: %shl
+; SSE41: Found an estimated cost of 8 for instruction: %shl
; AVX: Found an estimated cost of 8 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl
; XOPAVX: Found an estimated cost of 8 for instruction: %shl
More information about the llvm-commits mailing list