[llvm] [X86] Lower vXi8 multiplies using PMADDUBSW on SSSE3+ targets (PR #95690)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 16 01:12:45 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Extends https://github.com/llvm/llvm-project/pull/95403 to handle non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together.
Most targets would benefit from performing this for non-constant cases as well - it's just Intel Core/SandyBridge era CPUs that might experience additional Port0/15 contention.
Fixes https://github.com/llvm/llvm-project/issues/90748
---
Patch is 273.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95690.diff
18 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+13-10)
- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.cpp (+14-6)
- (modified) llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll (+12-12)
- (modified) llvm/test/Analysis/CostModel/X86/arith-int-latency.ll (+8-8)
- (modified) llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll (+8-8)
- (modified) llvm/test/Analysis/CostModel/X86/arith-int.ll (+20-20)
- (modified) llvm/test/Analysis/CostModel/X86/rem-codesize.ll (+55-17)
- (modified) llvm/test/Analysis/CostModel/X86/rem-latency.ll (+14-14)
- (modified) llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll (+14-14)
- (modified) llvm/test/Analysis/CostModel/X86/rem.ll (+109-33)
- (modified) llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx2-arith.ll (+8-10)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-128.ll (+185-206)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-256.ll (+270-325)
- (modified) llvm/test/CodeGen/X86/min-legal-vector-width.ll (+58-75)
- (modified) llvm/test/CodeGen/X86/pmul.ll (+123-143)
- (modified) llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll (+7-10)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll (+35-35)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f27c935812f51..02af650d69c75 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28506,17 +28506,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- // For vXi8 mul-by-constant, try PMADDUBSW to avoid the need for extension.
+ // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
// Don't do this if we only need to unpack one half.
- if (Subtarget.hasSSSE3() &&
- ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- bool IsLoLaneAllZeroOrUndef = true;
- bool IsHiLaneAllZeroOrUndef = true;
- for (auto [Idx, Val] : enumerate(B->ops())) {
- if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
- IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
- else
- IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ if (Subtarget.hasSSSE3()) {
+ bool BIsBuildVector = isa<BuildVectorSDNode>(B);
+ bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
+ bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
+ if (BIsBuildVector) {
+ for (auto [Idx, Val] : enumerate(B->ops())) {
+ if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
+ IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ else
+ IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ }
}
if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
@@ -28531,6 +28533,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
}
}
+
// Extract the lo/hi parts to any extend to i16.
// We're going to mask off the low byte of each result element of the
// pmullw, so it doesn't matter what's in the high byte of each 16-bit
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 64cacd74153fe..dd97c1f590a31 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -852,8 +852,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
{ ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
- { ISD::MUL, MVT::v32i8, { 6, 11,10,11 } }, // extend/pmullw/trunc
- { ISD::MUL, MVT::v64i8, { 6, 12,10,11 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
+ { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
{ ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
@@ -1119,7 +1119,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
{ ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
- { ISD::MUL, MVT::v32i8, { 6, 11,10,20 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
{ ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
{ ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
{ ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
@@ -1170,8 +1170,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v32i8, { 12, 12, 22, 23 } }, // unpack/pmullw + split
- { ISD::MUL, MVT::v16i8, { 5, 6, 10, 12 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
+ { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
{ ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
{ ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
@@ -1311,7 +1311,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
- { ISD::MUL, MVT::v16i8, { 6, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
};
@@ -1320,6 +1319,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
+ static const CostKindTblEntry SSSE3CostTable[] = {
+ { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
+ if (auto KindCost = Entry->Cost[CostKind])
+ return LT.first * *KindCost;
+
static const CostKindTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
index 050beb7fc25a3..f5ad65817950e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
@@ -791,9 +791,9 @@ define i32 @mul(i32 %arg) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; SSE42-LABEL: 'mul'
@@ -835,9 +835,9 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -858,8 +858,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -880,7 +880,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -902,8 +902,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -924,7 +924,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
index 6cf278e98bd85..ed58f0f554e23 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
@@ -680,8 +680,8 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -746,8 +746,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
index b5ca132d8c51d..c9ee064822636 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
@@ -680,8 +680,8 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -746,8 +746,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/95690
More information about the llvm-commits
mailing list