[clang] [AMDGPU] make w32i16/w32f16 legal (PR #70484)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Oct 27 10:40:02 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Changpeng Fang (changpeng)
<details>
<summary>Changes</summary>
Some upcoming intrinsics will be using these new types
---
Patch is 120.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70484.diff
17 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+69-18)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+2-2)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll (+2-2)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll (+8-8)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll (+8-8)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fadd.ll (+2-2)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll (+4-4)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fma.ll (+2-2)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fmul.ll (+2-2)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fsub.ll (+2-2)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/mul.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll (-8)
- (modified) llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll (+22)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+254-250)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+236-220)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index adf4e0139e03c1d..4bf68d293425621 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -384,7 +384,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
- MVT::v16f64, MVT::v16i64},
+ MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7d457edad0d5cdf..568f8078373fcfa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -165,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -275,7 +277,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32}) {
+ MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -554,7 +556,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16,
+ MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -640,6 +643,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
+
setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -662,12 +675,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Expand);
- for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16}) {
setOperationAction(
{ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
Vec16, Custom);
@@ -690,10 +706,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE,
{MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
- MVT::v16f16, MVT::v16i16},
+ MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
Custom);
- for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
+ for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
@@ -701,7 +717,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SSUBSAT},
VT, Custom);
- for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
+ for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
// Split vector operations.
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
VT, Custom);
@@ -737,8 +753,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT,
{MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
- Custom);
+ MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16}, Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
@@ -5107,7 +5123,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -5130,7 +5146,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -5897,7 +5913,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+ VT == MVT::v16f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -6415,7 +6432,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (VecSize == 128 || VecSize == 256) {
+ if (VecSize == 128 || VecSize == 256 || VecSize == 512 ) {
SDValue Lo, Hi;
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -6428,9 +6445,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Hi = DAG.getBitcast(HiVT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
DAG.getConstant(1, SL, MVT::i32)));
- } else {
- assert(VecSize == 256);
-
+ } else if (VecSize == 256) {
SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
SDValue Parts[4];
for (unsigned P = 0; P < 4; ++P) {
@@ -6442,6 +6457,20 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Parts[0], Parts[1]));
Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
Parts[2], Parts[3]));
+ } else {
+ assert(VecSize == 512);
+
+ SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
+ SDValue Parts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(P, SL, MVT::i32));
+ }
+
+ Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[0], Parts[1], Parts[2], Parts[3]));
+ Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[4], Parts[5],Parts[6], Parts[7]));
}
EVT IdxVT = Idx.getValueType();
@@ -6607,6 +6636,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
+ if (VT == MVT::v32i16 || VT == MVT::v32f16) {
+ EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 8);
+ MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
+
+ SmallVector<SDValue, 8> Parts[8];
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
+ for (unsigned P = 0; P < 8; ++P)
+ Parts[P].push_back(Op.getOperand(I + P * E));
+ }
+ SDValue Casts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+ Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+ }
+
+ SDValue Blend =
+ DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
@@ -9507,7 +9557,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
+ VT.getSizeInBits() == 512)
return splitTernaryVectorOp(Op, DAG);
assert(VT.getSizeInBits() == 64);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3cd0821b0f86c5f..e56269438472ee2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1619,6 +1619,16 @@ def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;
// 512-bit bitcast
+def : BitConvert <v32f16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v16i32, VReg_512>;
+def : BitConvert <v32f16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32f16, VReg_512>;
+def : BitConvert <v16i32, v32f16, VReg_512>;
+def : BitConvert <v32i16, v16i32, VReg_512>;
+def : BitConvert <v32i16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32i16, VReg_512>;
+def : BitConvert <v16i32, v32i16, VReg_512>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 304ee53cc5a87da..7ea2280c474b05e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -930,7 +930,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
let GlobalPriority = true in {
-defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}
@@ -984,7 +984,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
let GlobalPriority = true in {
-defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 2bebc5ed9b53bde..2a966b4ea178f27 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @add_i16() #0 {
; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
-; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef
+; FAST16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef
; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOW16-LABEL: 'add_i16'
@@ -98,7 +98,7 @@ define amdgpu_kernel void @add_i16() #0 {
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLOW16-SIZE-LABEL: 'add_i16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index b57f26cdc2928be..564bc4912af7d30 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -57,8 +57,8 @@ define i32 @add(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -115,8 +115,8 @@ define i32 @add(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -237,8 +237,8 @@ define i32 @sub(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -295,8 +295,8 @@ define i32 @sub(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT: Cos...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/70484
More information about the cfe-commits
mailing list