[llvm] [X86] Lower vXi8 multiplies using PMADDUBSW on SSSE3+ targets (PR #95690)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 16 01:15:08 PDT 2024
github-actions[bot] wrote:
:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff 22530e7985083032fe708848abb88b77be78e5ce ac4cb9eb1ebba47e94934fd3859295b54fd03704 -- llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86TargetTransformInfo.cpp
``````````
</details>
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 02af650d69..7d7631f925 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28533,7 +28533,6 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
}
}
-
// Extract the lo/hi parts to any extend to i16.
// We're going to mask off the low byte of each result element of the
// pmullw, so it doesn't matter what's in the high byte of each 16-bit
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index dd97c1f590..009febabf7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -820,46 +820,46 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWCostTable[] = {
- { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
- { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
- { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
- { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
- { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
- { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
- { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
- { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
- { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
-
- { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
- { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
- { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
- { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
- { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
- { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
- { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
- { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
- { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
-
- { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
- { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
-
- { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
- { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
- { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
- { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
-
- { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
- { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
-
- { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
- { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
- { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
- { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
-
- { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
- { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
- { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
- { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
+ {ISD::SHL, MVT::v16i8, {4, 8, 4, 5}}, // extend/vpsllvw/pack sequence.
+ {ISD::SRL, MVT::v16i8, {4, 8, 4, 5}}, // extend/vpsrlvw/pack sequence.
+ {ISD::SRA, MVT::v16i8, {4, 8, 4, 5}}, // extend/vpsravw/pack sequence.
+ {ISD::SHL, MVT::v32i8, {4, 23, 11, 16}}, // extend/vpsllvw/pack sequence.
+ {ISD::SRL, MVT::v32i8, {4, 30, 12, 18}}, // extend/vpsrlvw/pack sequence.
+ {ISD::SRA, MVT::v32i8, {6, 13, 24, 30}}, // extend/vpsravw/pack sequence.
+ {ISD::SHL, MVT::v64i8, {6, 19, 13, 15}}, // extend/vpsllvw/pack sequence.
+ {ISD::SRL, MVT::v64i8, {7, 27, 15, 18}}, // extend/vpsrlvw/pack sequence.
+ {ISD::SRA, MVT::v64i8, {15, 15, 30, 30}}, // extend/vpsravw/pack sequence.
+
+ {ISD::SHL, MVT::v8i16, {1, 1, 1, 1}}, // vpsllvw
+ {ISD::SRL, MVT::v8i16, {1, 1, 1, 1}}, // vpsrlvw
+ {ISD::SRA, MVT::v8i16, {1, 1, 1, 1}}, // vpsravw
+ {ISD::SHL, MVT::v16i16, {1, 1, 1, 1}}, // vpsllvw
+ {ISD::SRL, MVT::v16i16, {1, 1, 1, 1}}, // vpsrlvw
+ {ISD::SRA, MVT::v16i16, {1, 1, 1, 1}}, // vpsravw
+ {ISD::SHL, MVT::v32i16, {1, 1, 1, 1}}, // vpsllvw
+ {ISD::SRL, MVT::v32i16, {1, 1, 1, 1}}, // vpsrlvw
+ {ISD::SRA, MVT::v32i16, {1, 1, 1, 1}}, // vpsravw
+
+ {ISD::ADD, MVT::v64i8, {1, 1, 1, 1}}, // paddb
+ {ISD::ADD, MVT::v32i16, {1, 1, 1, 1}}, // paddw
+
+ {ISD::ADD, MVT::v32i8, {1, 1, 1, 1}}, // paddb
+ {ISD::ADD, MVT::v16i16, {1, 1, 1, 1}}, // paddw
+ {ISD::ADD, MVT::v8i32, {1, 1, 1, 1}}, // paddd
+ {ISD::ADD, MVT::v4i64, {1, 1, 1, 1}}, // paddq
+
+ {ISD::SUB, MVT::v64i8, {1, 1, 1, 1}}, // psubb
+ {ISD::SUB, MVT::v32i16, {1, 1, 1, 1}}, // psubw
+
+ {ISD::MUL, MVT::v16i8, {4, 12, 4, 5}}, // extend/pmullw/trunc
+ {ISD::MUL, MVT::v32i8, {3, 10, 7, 10}}, // pmaddubsw
+ {ISD::MUL, MVT::v64i8, {3, 11, 7, 10}}, // pmaddubsw
+ {ISD::MUL, MVT::v32i16, {1, 5, 1, 1}}, // pmullw
+
+ {ISD::SUB, MVT::v32i8, {1, 1, 1, 1}}, // psubb
+ {ISD::SUB, MVT::v16i16, {1, 1, 1, 1}}, // psubw
+ {ISD::SUB, MVT::v8i32, {1, 1, 1, 1}}, // psubd
+ {ISD::SUB, MVT::v4i64, {1, 1, 1, 1}}, // psubq
};
// Look for AVX512BW lowering tricks for custom cases.
@@ -1092,72 +1092,72 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
return LT.first * *KindCost;
static const CostKindTblEntry AVX2CostTable[] = {
- { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
- { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
- { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
- { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
- { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
- { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
- { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
- { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
- { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
-
- { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
- { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
- { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
- { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
- { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
- { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
- { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
- { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
-
- { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
- { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
- { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
- { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
- { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
- { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
- { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
-
- { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
-
- { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
- { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
-
- { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
- { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
- { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
- { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
- { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
- { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
-
- { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
- { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
- { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
- { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
- { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
- { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
-
- { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
- { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
- { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
- { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
- { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
- { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
-
- { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
- { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
- { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
- { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
- { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
- { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
+ {ISD::SHL, MVT::v16i8, {6, 21, 11, 16}}, // vpblendvb sequence.
+ {ISD::SHL, MVT::v32i8, {6, 23, 11, 22}}, // vpblendvb sequence.
+ {ISD::SHL, MVT::v8i16, {5, 18, 5, 10}}, // extend/vpsrlvd/pack sequence.
+ {ISD::SHL, MVT::v16i16, {8, 10, 10, 14}}, // extend/vpsrlvd/pack sequence.
+
+ {ISD::SRL, MVT::v16i8, {6, 27, 12, 18}}, // vpblendvb sequence.
+ {ISD::SRL, MVT::v32i8, {8, 30, 12, 24}}, // vpblendvb sequence.
+ {ISD::SRL, MVT::v8i16, {5, 11, 5, 10}}, // extend/vpsrlvd/pack sequence.
+ {ISD::SRL, MVT::v16i16, {8, 10, 10, 14}}, // extend/vpsrlvd/pack sequence.
+
+ {ISD::SRA, MVT::v16i8, {17, 17, 24, 30}}, // vpblendvb sequence.
+ {ISD::SRA, MVT::v32i8, {18, 20, 24, 43}}, // vpblendvb sequence.
+ {ISD::SRA, MVT::v8i16, {5, 11, 5, 10}}, // extend/vpsravd/pack sequence.
+ {ISD::SRA, MVT::v16i16, {8, 10, 10, 14}}, // extend/vpsravd/pack sequence.
+ {ISD::SRA, MVT::v2i64, {4, 5, 5, 5}}, // srl/xor/sub sequence.
+ {ISD::SRA, MVT::v4i64, {8, 8, 5, 9}}, // srl/xor/sub sequence.
+
+ {ISD::SUB, MVT::v32i8, {1, 1, 1, 2}}, // psubb
+ {ISD::ADD, MVT::v32i8, {1, 1, 1, 2}}, // paddb
+ {ISD::SUB, MVT::v16i16, {1, 1, 1, 2}}, // psubw
+ {ISD::ADD, MVT::v16i16, {1, 1, 1, 2}}, // paddw
+ {ISD::SUB, MVT::v8i32, {1, 1, 1, 2}}, // psubd
+ {ISD::ADD, MVT::v8i32, {1, 1, 1, 2}}, // paddd
+ {ISD::SUB, MVT::v4i64, {1, 1, 1, 2}}, // psubq
+ {ISD::ADD, MVT::v4i64, {1, 1, 1, 2}}, // paddq
+
+ {ISD::MUL, MVT::v16i8, {5, 18, 6, 12}}, // extend/pmullw/pack
+ {ISD::MUL, MVT::v32i8, {4, 8, 8, 16}}, // pmaddubsw
+ {ISD::MUL, MVT::v16i16, {2, 5, 1, 2}}, // pmullw
+ {ISD::MUL, MVT::v8i32, {4, 10, 1, 2}}, // pmulld
+ {ISD::MUL, MVT::v4i32, {2, 10, 1, 2}}, // pmulld
+ {ISD::MUL, MVT::v4i64, {6, 10, 8, 13}}, // 3*pmuludq/3*shift/2*add
+ {ISD::MUL, MVT::v2i64, {6, 10, 8, 8}}, // 3*pmuludq/3*shift/2*add
+
+ {X86ISD::PMULUDQ, MVT::v4i64, {1, 5, 1, 1}},
+
+ {ISD::FNEG, MVT::v4f64, {1, 1, 1, 2}}, // vxorpd
+ {ISD::FNEG, MVT::v8f32, {1, 1, 1, 2}}, // vxorps
+
+ {ISD::FADD, MVT::f64, {1, 4, 1, 1}}, // vaddsd
+ {ISD::FADD, MVT::f32, {1, 4, 1, 1}}, // vaddss
+ {ISD::FADD, MVT::v2f64, {1, 4, 1, 1}}, // vaddpd
+ {ISD::FADD, MVT::v4f32, {1, 4, 1, 1}}, // vaddps
+ {ISD::FADD, MVT::v4f64, {1, 4, 1, 2}}, // vaddpd
+ {ISD::FADD, MVT::v8f32, {1, 4, 1, 2}}, // vaddps
+
+ {ISD::FSUB, MVT::f64, {1, 4, 1, 1}}, // vsubsd
+ {ISD::FSUB, MVT::f32, {1, 4, 1, 1}}, // vsubss
+ {ISD::FSUB, MVT::v2f64, {1, 4, 1, 1}}, // vsubpd
+ {ISD::FSUB, MVT::v4f32, {1, 4, 1, 1}}, // vsubps
+ {ISD::FSUB, MVT::v4f64, {1, 4, 1, 2}}, // vsubpd
+ {ISD::FSUB, MVT::v8f32, {1, 4, 1, 2}}, // vsubps
+
+ {ISD::FMUL, MVT::f64, {1, 5, 1, 1}}, // vmulsd
+ {ISD::FMUL, MVT::f32, {1, 5, 1, 1}}, // vmulss
+ {ISD::FMUL, MVT::v2f64, {1, 5, 1, 1}}, // vmulpd
+ {ISD::FMUL, MVT::v4f32, {1, 5, 1, 1}}, // vmulps
+ {ISD::FMUL, MVT::v4f64, {1, 5, 1, 2}}, // vmulpd
+ {ISD::FMUL, MVT::v8f32, {1, 5, 1, 2}}, // vmulps
+
+ {ISD::FDIV, MVT::f32, {7, 13, 1, 1}}, // vdivss
+ {ISD::FDIV, MVT::v4f32, {7, 13, 1, 1}}, // vdivps
+ {ISD::FDIV, MVT::v8f32, {14, 21, 1, 3}}, // vdivps
+ {ISD::FDIV, MVT::f64, {14, 20, 1, 1}}, // vdivsd
+ {ISD::FDIV, MVT::v2f64, {14, 20, 1, 1}}, // vdivpd
+ {ISD::FDIV, MVT::v4f64, {28, 35, 1, 3}}, // vdivpd
};
// Look for AVX2 lowering tricks for custom cases.
@@ -1167,99 +1167,139 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
return LT.first * *KindCost;
static const CostKindTblEntry AVX1CostTable[] = {
- // We don't have to scalarize unsupported ops. We can issue two half-sized
- // operations and we only need to extract the upper YMM half.
- // Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
- { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
- { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
- { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
- { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
- { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
-
- { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
- { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
- { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
- { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
-
- { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
- { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
- { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
- { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
-
- { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
- { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
- { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
- { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
-
- { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
- { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
- { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
- { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
- { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
- { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
- { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
- { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
- { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
- { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
-
- { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
- { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
- { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
- { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
- { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
-
- { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
- { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
- { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
- { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
- { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
-
- { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
- { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
- { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
- { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
- { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
-
- { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
- { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
-
- { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
- { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
-
- { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
- { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
- { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
-
- { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
- { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
- { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
- { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
- { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
- { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
-
- { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ {ISD::MUL, MVT::v32i8, {10, 11, 18, 19}}, // pmaddubsw + split
+ {ISD::MUL, MVT::v16i8, {5, 6, 8, 12}}, // 2*pmaddubsw/3*and/psllw/or
+ {ISD::MUL, MVT::v16i16, {4, 8, 5, 6}}, // pmullw + split
+ {ISD::MUL, MVT::v8i32, {5, 8, 5, 10}}, // pmulld + split
+ {ISD::MUL, MVT::v4i32, {2, 5, 1, 3}}, // pmulld
+ {ISD::MUL, MVT::v4i64, {12, 15, 19, 20}},
+
+ {ISD::AND, MVT::v32i8, {1, 1, 1, 2}}, // vandps
+ {ISD::AND, MVT::v16i16, {1, 1, 1, 2}}, // vandps
+ {ISD::AND, MVT::v8i32, {1, 1, 1, 2}}, // vandps
+ {ISD::AND, MVT::v4i64, {1, 1, 1, 2}}, // vandps
+
+ {ISD::OR, MVT::v32i8, {1, 1, 1, 2}}, // vorps
+ {ISD::OR, MVT::v16i16, {1, 1, 1, 2}}, // vorps
+ {ISD::OR, MVT::v8i32, {1, 1, 1, 2}}, // vorps
+ {ISD::OR, MVT::v4i64, {1, 1, 1, 2}}, // vorps
+
+ {ISD::XOR, MVT::v32i8, {1, 1, 1, 2}}, // vxorps
+ {ISD::XOR, MVT::v16i16, {1, 1, 1, 2}}, // vxorps
+ {ISD::XOR, MVT::v8i32, {1, 1, 1, 2}}, // vxorps
+ {ISD::XOR, MVT::v4i64, {1, 1, 1, 2}}, // vxorps
+
+ {ISD::SUB, MVT::v32i8, {4, 2, 5, 6}}, // psubb + split
+ {ISD::ADD, MVT::v32i8, {4, 2, 5, 6}}, // paddb + split
+ {ISD::SUB, MVT::v16i16, {4, 2, 5, 6}}, // psubw + split
+ {ISD::ADD, MVT::v16i16, {4, 2, 5, 6}}, // paddw + split
+ {ISD::SUB, MVT::v8i32, {4, 2, 5, 6}}, // psubd + split
+ {ISD::ADD, MVT::v8i32, {4, 2, 5, 6}}, // paddd + split
+ {ISD::SUB, MVT::v4i64, {4, 2, 5, 6}}, // psubq + split
+ {ISD::ADD, MVT::v4i64, {4, 2, 5, 6}}, // paddq + split
+ {ISD::SUB, MVT::v2i64, {1, 1, 1, 1}}, // psubq
+ {ISD::ADD, MVT::v2i64, {1, 1, 1, 1}}, // paddq
+
+ {ISD::SHL, MVT::v16i8, {10, 21, 11, 17}}, // pblendvb sequence.
+ {ISD::SHL, MVT::v32i8, {22, 22, 27, 40}}, // pblendvb sequence + split.
+ {ISD::SHL, MVT::v8i16, {6, 9, 11, 11}}, // pblendvb sequence.
+ {ISD::SHL, MVT::v16i16, {13, 16, 24, 25}}, // pblendvb sequence + split.
+ {ISD::SHL, MVT::v4i32, {3, 11, 4, 6}}, // pslld/paddd/cvttps2dq/pmulld
+ {ISD::SHL,
+ MVT::v8i32,
+ {9, 11, 12, 17}}, // pslld/paddd/cvttps2dq/pmulld + split
+ {ISD::SHL, MVT::v2i64, {2, 4, 4, 6}}, // Shift each lane + blend.
+ {ISD::SHL,
+ MVT::v4i64,
+ {6, 7, 11, 15}}, // Shift each lane + blend + split.
+
+ {ISD::SRL, MVT::v16i8, {11, 27, 12, 18}}, // pblendvb sequence.
+ {ISD::SRL, MVT::v32i8, {23, 23, 30, 43}}, // pblendvb sequence + split.
+ {ISD::SRL, MVT::v8i16, {13, 16, 14, 22}}, // pblendvb sequence.
+ {ISD::SRL, MVT::v16i16, {28, 30, 31, 48}}, // pblendvb sequence + split.
+ {ISD::SRL, MVT::v4i32, {6, 7, 12, 16}}, // Shift each lane + blend.
+ {ISD::SRL,
+ MVT::v8i32,
+ {14, 14, 26, 34}}, // Shift each lane + blend + split.
+ {ISD::SRL, MVT::v2i64, {2, 4, 4, 6}}, // Shift each lane + blend.
+ {ISD::SRL,
+ MVT::v4i64,
+ {6, 7, 11, 15}}, // Shift each lane + blend + split.
+
+ {ISD::SRA, MVT::v16i8, {21, 22, 24, 36}}, // pblendvb sequence.
+ {ISD::SRA, MVT::v32i8, {44, 45, 51, 76}}, // pblendvb sequence + split.
+ {ISD::SRA, MVT::v8i16, {13, 16, 14, 22}}, // pblendvb sequence.
+ {ISD::SRA, MVT::v16i16, {28, 30, 31, 48}}, // pblendvb sequence + split.
+ {ISD::SRA, MVT::v4i32, {6, 7, 12, 16}}, // Shift each lane + blend.
+ {ISD::SRA,
+ MVT::v8i32,
+ {14, 14, 26, 34}}, // Shift each lane + blend + split.
+ {ISD::SRA, MVT::v2i64, {5, 6, 10, 14}}, // Shift each lane + blend.
+ {ISD::SRA,
+ MVT::v4i64,
+ {12, 12, 22, 30}}, // Shift each lane + blend + split.
+
+ {ISD::FNEG,
+ MVT::v4f64,
+ {2, 2, 1, 2}}, // BTVER2 from http://www.agner.org/
+ {ISD::FNEG,
+ MVT::v8f32,
+ {2, 2, 1, 2}}, // BTVER2 from http://www.agner.org/
+
+ {ISD::FADD, MVT::f64, {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FADD, MVT::f32, {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FADD,
+ MVT::v2f64,
+ {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FADD,
+ MVT::v4f32,
+ {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FADD,
+ MVT::v4f64,
+ {2, 5, 1, 2}}, // BDVER2 from http://www.agner.org/
+ {ISD::FADD,
+ MVT::v8f32,
+ {2, 5, 1, 2}}, // BDVER2 from http://www.agner.org/
+
+ {ISD::FSUB, MVT::f64, {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FSUB, MVT::f32, {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FSUB,
+ MVT::v2f64,
+ {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FSUB,
+ MVT::v4f32,
+ {1, 5, 1, 1}}, // BDVER2 from http://www.agner.org/
+ {ISD::FSUB,
+ MVT::v4f64,
+ {2, 5, 1, 2}}, // BDVER2 from http://www.agner.org/
+ {ISD::FSUB,
+ MVT::v8f32,
+ {2, 5, 1, 2}}, // BDVER2 from http://www.agner.org/
+
+ {ISD::FMUL, MVT::f64, {2, 5, 1, 1}}, // BTVER2 from http://www.agner.org/
+ {ISD::FMUL, MVT::f32, {1, 5, 1, 1}}, // BTVER2 from http://www.agner.org/
+ {ISD::FMUL,
+ MVT::v2f64,
+ {2, 5, 1, 1}}, // BTVER2 from http://www.agner.org/
+ {ISD::FMUL,
+ MVT::v4f32,
+ {1, 5, 1, 1}}, // BTVER2 from http://www.agner.org/
+ {ISD::FMUL,
+ MVT::v4f64,
+ {4, 5, 1, 2}}, // BTVER2 from http://www.agner.org/
+ {ISD::FMUL,
+ MVT::v8f32,
+ {2, 5, 1, 2}}, // BTVER2 from http://www.agner.org/
+
+ {ISD::FDIV, MVT::f32, {14, 14, 1, 1}}, // SNB from http://www.agner.org/
+ {ISD::FDIV, MVT::v4f32, {14, 14, 1, 1}}, // SNB from http://www.agner.org/
+ {ISD::FDIV, MVT::v8f32, {28, 29, 1, 3}}, // SNB from http://www.agner.org/
+ {ISD::FDIV, MVT::f64, {22, 22, 1, 1}}, // SNB from http://www.agner.org/
+ {ISD::FDIV, MVT::v2f64, {22, 22, 1, 1}}, // SNB from http://www.agner.org/
+ {ISD::FDIV, MVT::v4f64, {44, 45, 1, 3}}, // SNB from http://www.agner.org/
};
if (ST->hasAVX())
@@ -1297,21 +1337,21 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
return LT.first * *KindCost;
static const CostKindTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
- { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
- { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
+ {ISD::SHL, MVT::v16i8, {15, 24, 17, 22}}, // pblendvb sequence.
+ {ISD::SHL, MVT::v8i16, {11, 14, 11, 11}}, // pblendvb sequence.
+ {ISD::SHL, MVT::v4i32, {14, 20, 4, 10}}, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
- { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
- { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
+ {ISD::SRL, MVT::v16i8, {16, 27, 18, 24}}, // pblendvb sequence.
+ {ISD::SRL, MVT::v8i16, {22, 26, 23, 27}}, // pblendvb sequence.
+ {ISD::SRL, MVT::v4i32, {16, 17, 15, 19}}, // Shift each lane + blend.
+ {ISD::SRL, MVT::v2i64, {4, 6, 5, 7}}, // splat+shuffle sequence.
- { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
- { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
- { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
+ {ISD::SRA, MVT::v16i8, {38, 41, 30, 36}}, // pblendvb sequence.
+ {ISD::SRA, MVT::v8i16, {22, 26, 23, 27}}, // pblendvb sequence.
+ {ISD::SRA, MVT::v4i32, {16, 17, 15, 19}}, // Shift each lane + blend.
+ {ISD::SRA, MVT::v2i64, {8, 17, 5, 7}}, // splat+shuffle sequence.
- { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
+ {ISD::MUL, MVT::v4i32, {2, 11, 1, 1}} // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
@@ -1320,7 +1360,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
return LT.first * *KindCost;
static const CostKindTblEntry SSSE3CostTable[] = {
- { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
+ {ISD::MUL, MVT::v16i8, {5, 18, 10, 12}}, // 2*pmaddubsw/3*and/psllw/or
};
if (ST->hasSSSE3())
``````````
</details>
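
For context on the sequence the cost-table comments above describe ("2*pmaddubsw/3*and/psllw/or"), here is a rough intrinsics sketch of how a vXi8 multiply can be expressed with PMADDUBSW. This is an illustration only, not code taken from the PR, and the helper name `mul_v16i8_ssse3` is made up for the example.

``````````cpp
#include <immintrin.h>

// Multiply sixteen 8-bit lanes and keep the low 8 bits of each product
// (illustrative sketch of the pmaddubsw-based lowering, not the PR's code).
static __m128i mul_v16i8_ssse3(__m128i a, __m128i b) {
  const __m128i lo_mask = _mm_set1_epi16(0x00FF);
  // Zero the high byte of every 16-bit lane of b, so each pmaddubsw lane
  // holds the single product a[2i] * b[2i] (nothing to add, no saturation).
  __m128i even = _mm_maddubs_epi16(a, _mm_and_si128(b, lo_mask));
  // Zero the low byte of every 16-bit lane of b instead, giving
  // a[2i+1] * b[2i+1] in each lane.
  __m128i odd = _mm_maddubs_epi16(a, _mm_andnot_si128(lo_mask, b));
  // Keep the low byte of the even products, shift the odd products back into
  // the high byte of each 16-bit lane, then merge.
  return _mm_or_si128(_mm_and_si128(even, lo_mask), _mm_slli_epi16(odd, 8));
}
``````````

Because one byte of every 16-bit pair is zeroed before each PMADDUBSW, each lane holds a single 8x8 product, so the signed saturation of the instruction can never trigger and the low byte of every lane is exact.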
https://github.com/llvm/llvm-project/pull/95690
More information about the llvm-commits mailing list