[llvm] r285304 - [X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 27 08:27:01 PDT 2016
Author: rksimon
Date: Thu Oct 27 10:27:00 2016
New Revision: 285304
URL: http://llvm.org/viewvc/llvm-project?rev=285304&view=rev
Log:
[X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64
With DQI but without VLX, lower v2i64 and v4i64 MUL operations by widening the operands to v8i64, performing a v8i64 MUL (vpmullq), and extracting the low subvector of the result.
Updated cost table accordingly.
Differential Revision: https://reviews.llvm.org/D26011
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/X86/arith.ll
llvm/trunk/test/CodeGen/X86/avx512-arith.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=285304&r1=285303&r2=285304&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Oct 27 10:27:00 2016
@@ -19854,6 +19854,25 @@ static SDValue LowerMUL(SDValue Op, cons
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
+ // AVX512DQ - extend to 512 bit vector.
+ // FIXME: This can possibly be converted to a tablegen pattern.
+ if (Subtarget.hasDQI()) {
+ assert(!Subtarget.hasVLX() && "AVX512DQVL vXi64 multiply is legal");
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ "AVX512DQ v8i64 multiply is legal");
+
+ MVT NewVT = MVT::getVectorVT(MVT::i64, 512 / VT.getScalarSizeInBits());
+ SDValue A512 =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), A,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue B512 =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), B,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue MulNode = DAG.getNode(ISD::MUL, dl, NewVT, A512, B512);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MulNode,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=285304&r1=285303&r2=285304&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Thu Oct 27 10:27:00 2016
@@ -204,6 +204,19 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX512DQCostTable[] = {
+ { ISD::MUL, MVT::v2i64, 1 },
+ { ISD::MUL, MVT::v4i64, 1 },
+ { ISD::MUL, MVT::v8i64, 1 }
+ };
+
+ // Look for AVX512DQ lowering tricks for custom cases.
+ if (ST->hasDQI()) {
+ if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
static const CostTblEntry AVX512BWCostTable[] = {
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v64i8, 64*20 },
Modified: llvm/trunk/test/Analysis/CostModel/X86/arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith.ll?rev=285304&r1=285303&r2=285304&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith.ll Thu Oct 27 10:27:00 2016
@@ -4,6 +4,7 @@
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -429,19 +430,25 @@ define i32 @mul(i32 %arg) {
; SSE42: cost of 9 {{.*}} %A = mul
; AVX: cost of 9 {{.*}} %A = mul
; AVX2: cost of 9 {{.*}} %A = mul
- ; AVX512: cost of 9 {{.*}} %A = mul
+ ; AVX512F: cost of 9 {{.*}} %A = mul
+ ; AVX512BW: cost of 9 {{.*}} %A = mul
+ ; AVX512DQ: cost of 1 {{.*}} %A = mul
%A = mul <2 x i64> undef, undef
; SSSE3: cost of 18 {{.*}} %B = mul
; SSE42: cost of 18 {{.*}} %B = mul
; AVX: cost of 18 {{.*}} %B = mul
; AVX2: cost of 9 {{.*}} %B = mul
- ; AVX512: cost of 9 {{.*}} %B = mul
+ ; AVX512F: cost of 9 {{.*}} %B = mul
+ ; AVX512BW: cost of 9 {{.*}} %B = mul
+ ; AVX512DQ: cost of 1 {{.*}} %B = mul
%B = mul <4 x i64> undef, undef
; SSSE3: cost of 36 {{.*}} %C = mul
; SSE42: cost of 36 {{.*}} %C = mul
; AVX: cost of 36 {{.*}} %C = mul
; AVX2: cost of 18 {{.*}} %C = mul
- ; AVX512: cost of 2 {{.*}} %C = mul
+ ; AVX512F: cost of 2 {{.*}} %C = mul
+ ; AVX512BW: cost of 2 {{.*}} %C = mul
+ ; AVX512DQ: cost of 1 {{.*}} %C = mul
%C = mul <8 x i64> undef, undef
; SSSE3: cost of 6 {{.*}} %D = mul
@@ -515,7 +522,9 @@ define void @mul_2i32() {
; SSE42: cost of 9 {{.*}} %A0 = mul
; AVX: cost of 9 {{.*}} %A0 = mul
; AVX2: cost of 9 {{.*}} %A0 = mul
- ; AVX512: cost of 9 {{.*}} %A0 = mul
+ ; AVX512F: cost of 9 {{.*}} %A0 = mul
+ ; AVX512BW: cost of 9 {{.*}} %A0 = mul
+ ; AVX512DQ: cost of 1 {{.*}} %A0 = mul
%A0 = mul <2 x i32> undef, undef
ret void
Modified: llvm/trunk/test/CodeGen/X86/avx512-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-arith.ll?rev=285304&r1=285303&r2=285304&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-arith.ll Thu Oct 27 10:27:00 2016
@@ -182,15 +182,10 @@ define <4 x i64> @imulq256(<4 x i64> %y,
;
; AVX512DQ-LABEL: imulq256:
; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq256:
@@ -243,15 +238,10 @@ define <2 x i64> @imulq128(<2 x i64> %y,
;
; AVX512DQ-LABEL: imulq128:
; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
-; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm3
-; AVX512DQ-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
-; AVX512DQ-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpaddq %xmm0, %xmm3, %xmm0
-; AVX512DQ-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq128:
More information about the llvm-commits mailing list