[llvm] 22530e7 - [CostModel][X86] Update vXi8 mul costs for AVX512BW/AVX2/AVX1/SSE
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 15 23:28:50 PDT 2024
Author: Simon Pilgrim
Date: 2024-06-16T07:27:35+01:00
New Revision: 22530e7985083032fe708848abb88b77be78e5ce
URL: https://github.com/llvm/llvm-project/commit/22530e7985083032fe708848abb88b77be78e5ce
DIFF: https://github.com/llvm/llvm-project/commit/22530e7985083032fe708848abb88b77be78e5ce.diff
LOG: [CostModel][X86] Update vXi8 mul costs for AVX512BW/AVX2/AVX1/SSE
Later levels were inheriting some of the worst case costs from SSE/AVX1 etc.
Based off llvm-mca numbers from the check_cost_tables.py script in https://github.com/RKSimon/llvm-scripts
Cleanup prep work for #90748
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
llvm/test/Analysis/CostModel/X86/arith-int.ll
llvm/test/Analysis/CostModel/X86/rem-codesize.ll
llvm/test/Analysis/CostModel/X86/rem-latency.ll
llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
llvm/test/Analysis/CostModel/X86/rem.ll
llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 74948778ccf85..64cacd74153fe 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -851,7 +851,9 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
{ ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
- { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
+ { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
+ { ISD::MUL, MVT::v32i8, { 6, 11,10,11 } }, // extend/pmullw/trunc
+ { ISD::MUL, MVT::v64i8, { 6, 12,10,11 } }, // unpack/pmullw
{ ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
@@ -1117,7 +1119,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
{ ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
- { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 6, 11,10,20 } }, // unpack/pmullw
{ ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
{ ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
{ ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
@@ -1168,7 +1170,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
+ { ISD::MUL, MVT::v32i8, { 12, 12, 22, 23 } }, // unpack/pmullw + split
+ { ISD::MUL, MVT::v16i8, { 5, 6, 10, 12 } }, // unpack/pmullw
{ ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
{ ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
@@ -1308,7 +1311,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
- { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
+ { ISD::MUL, MVT::v16i8, { 6, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
};
@@ -1353,7 +1356,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
{ ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
- { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
+ { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
index 64dce905e8aa1..050beb7fc25a3 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
@@ -901,7 +901,7 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = mul <16 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
index e001368853768..6cf278e98bd85 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
@@ -679,9 +679,9 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -745,9 +745,9 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
index c5ae4d8c901a0..b5ca132d8c51d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -745,8 +745,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll
index 45bb058873ddc..e0f7afd3eb42d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll
@@ -905,9 +905,9 @@ define i32 @mul(i32 %arg) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'mul'
@@ -927,9 +927,9 @@ define i32 @mul(i32 %arg) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'mul'
@@ -1015,9 +1015,9 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = mul <16 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -1059,9 +1059,9 @@ define i32 @mul(i32 %arg) {
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = mul <2 x i8> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I8 = mul <4 x i8> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'mul'
@@ -1081,9 +1081,9 @@ define i32 @mul(i32 %arg) {
; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = mul i64 undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
index 9ae78fb3ec606..86e67eda2b1f2 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
@@ -379,7 +379,7 @@ define i32 @srem_constpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
@@ -617,7 +617,7 @@ define i32 @srem_uniformconstpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
diff --git a/llvm/test/Analysis/CostModel/X86/rem-latency.ll b/llvm/test/Analysis/CostModel/X86/rem-latency.ll
index 4fa4dfd392d09..1428c6cdaca63 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-latency.ll
@@ -322,9 +322,9 @@ define i32 @srem_constpow2() {
; AVX1-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX1-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'srem_constpow2'
@@ -379,9 +379,9 @@ define i32 @srem_constpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; SLM-LABEL: 'srem_constpow2'
@@ -560,9 +560,9 @@ define i32 @srem_uniformconstpow2() {
; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'srem_uniformconstpow2'
@@ -617,9 +617,9 @@ define i32 @srem_uniformconstpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; SLM-LABEL: 'srem_uniformconstpow2'
diff --git a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
index f737b4369d853..2dc4c823cb539 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
@@ -342,8 +342,8 @@ define i32 @srem_constpow2() {
; AVX2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 266 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 268 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'srem_constpow2'
@@ -361,7 +361,7 @@ define i32 @srem_constpow2() {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
; AVX512F-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -379,8 +379,8 @@ define i32 @srem_constpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -580,8 +580,8 @@ define i32 @srem_uniformconstpow2() {
; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'srem_uniformconstpow2'
@@ -599,7 +599,7 @@ define i32 @srem_uniformconstpow2() {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -617,8 +617,8 @@ define i32 @srem_uniformconstpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll
index 8bbf775f8efd0..7ca17b2af033a 100644
--- a/llvm/test/Analysis/CostModel/X86/rem.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem.ll
@@ -721,9 +721,9 @@ define i32 @srem_constpow2() {
; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; SSE2-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 300 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'srem_constpow2'
@@ -740,9 +740,9 @@ define i32 @srem_constpow2() {
; SSE42-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; SSE42-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'srem_constpow2'
@@ -816,9 +816,9 @@ define i32 @srem_constpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'srem_constpow2'
@@ -835,9 +835,9 @@ define i32 @srem_constpow2() {
; SLM-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; SLM-NEXT: Cost Model: Found an estimated cost of 280 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLM-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLM-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLM-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLM-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'srem_constpow2'
@@ -854,9 +854,9 @@ define i32 @srem_constpow2() {
; GLM-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; GLM-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; GLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; GLM-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; GLM-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; GLM-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; GLM-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = srem i64 undef, 16
@@ -978,9 +978,9 @@ define i32 @srem_uniformconstpow2() {
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'srem_uniformconstpow2'
@@ -997,9 +997,9 @@ define i32 @srem_uniformconstpow2() {
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'srem_uniformconstpow2'
@@ -1073,9 +1073,9 @@ define i32 @srem_uniformconstpow2() {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'srem_uniformconstpow2'
@@ -1092,9 +1092,9 @@ define i32 @srem_uniformconstpow2() {
; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'srem_uniformconstpow2'
@@ -1111,9 +1111,9 @@ define i32 @srem_uniformconstpow2() {
; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16
-; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; GLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; GLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; GLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = srem i64 undef, 16
diff --git a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
index 204290bd4e40f..251af66ef7be0 100644
--- a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -192,7 +192,7 @@ entry:
define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: 'slm-costs_8_v16_mul'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = mul nsw <16 x i8> %a, %b
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = mul nsw <16 x i8> %a, %b
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %res
;
entry:
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 97be7304bb0c8..3d69f15fc5f24 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -51,16 +51,11 @@ define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
; Negative test - eliminating extract is profitable, but vector multiply is expensive.
define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext13_ext13_mul(
-; SSE-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[X:%.*]], [[Y:%.*]]
-; SSE-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 13
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext13_ext13_mul(
-; AVX-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
-; AVX-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
-; AVX-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext13_ext13_mul(
+; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
+; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
+; CHECK-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 13
%e1 = extractelement <16 x i8> %y, i32 13
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index c32ce70c2653f..52f7cd859a1ab 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -51,16 +51,11 @@ define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
; Negative test - eliminating extract is profitable, but vector multiply is expensive.
define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext13_ext13_mul(
-; SSE-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[X:%.*]], [[Y:%.*]]
-; SSE-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 13
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext13_ext13_mul(
-; AVX-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
-; AVX-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
-; AVX-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext13_ext13_mul(
+; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
+; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
+; CHECK-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 13
%e1 = extractelement <16 x i8> %y, i32 13
More information about the llvm-commits
mailing list