[clang] [AMDGPU] make w32i16/w32f16 legal (PR #70484)

Fri Oct 27 10:40:02 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)

<details>
<summary>Changes</summary>

Some upcoming intrinsics will be using these new types

---

Patch is 120.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70484.diff


17 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1-1) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+69-18) 
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10) 
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+2-2) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll (+8-8) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll (+8-8) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fadd.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll (+4-4) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fma.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fmul.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/fsub.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/AMDGPU/mul.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll (-8) 
- (modified) llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll (+22) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+254-250) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+236-220) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index adf4e0139e03c1d..4bf68d293425621 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -384,7 +384,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
        MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
        MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64,
        MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64,
-       MVT::v16f64, MVT::v16i64},
+       MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
       Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7d457edad0d5cdf..568f8078373fcfa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -165,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
     addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
     addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
+    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
   }
 
   addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -275,7 +277,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
         MVT::v4f16,  MVT::v3i64,  MVT::v3f64,   MVT::v6i32,  MVT::v6f32,
         MVT::v4i64,  MVT::v4f64,  MVT::v8i64,   MVT::v8f64,  MVT::v8i16,
         MVT::v8f16,  MVT::v16i16, MVT::v16f16,  MVT::v16i64, MVT::v16f64,
-        MVT::v32i32, MVT::v32f32}) {
+        MVT::v32i32, MVT::v32f32, MVT::v32i16,  MVT::v32f16}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -554,7 +556,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
     for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
-                   MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+                   MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16,
+                   MVT::v32f16}) {
       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
         switch (Op) {
         case ISD::LOAD:
@@ -640,6 +643,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::STORE, MVT::v16f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
 
+    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
+    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
+
+    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
+    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
+
     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                        MVT::v2i32, Expand);
     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -662,12 +675,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
 
     setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
-                       {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
+                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+                       Custom);
 
     setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
-                       {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
+                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+                       Expand);
 
-    for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+    for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+                      MVT::v32i16, MVT::v32f16}) {
       setOperationAction(
           {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
           Vec16, Custom);
@@ -690,10 +706,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
     setOperationAction(ISD::VECTOR_SHUFFLE,
                        {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
-                        MVT::v16f16, MVT::v16i16},
+                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
                        Custom);
 
-    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
+    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
       // Split vector operations.
       setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                           ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
@@ -701,7 +717,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                           ISD::SSUBSAT},
                          VT, Custom);
 
-    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
+    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
       // Split vector operations.
       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                          VT, Custom);
@@ -737,8 +753,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::SELECT,
                      {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
-                      MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
-                     Custom);
+                      MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+                      MVT::v32i16, MVT::v32f16}, Custom);
 
   setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
 
@@ -5107,7 +5123,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
-         VT == MVT::v32f32);
+         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -5130,7 +5146,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
-         VT == MVT::v32f32);
+         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   SDValue Lo0, Hi0;
   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -5897,7 +5913,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
   if (IsIEEEMode)
     return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
 
-  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
+  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+      VT == MVT::v16f16)
     return splitBinaryVectorOp(Op, DAG);
   return Op;
 }
@@ -6415,7 +6432,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
     return Combined;
 
-  if (VecSize == 128 || VecSize == 256) {
+  if (VecSize == 128 || VecSize == 256 || VecSize == 512 ) {
     SDValue Lo, Hi;
     EVT LoVT, HiVT;
     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -6428,9 +6445,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
       Hi = DAG.getBitcast(HiVT,
                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                       DAG.getConstant(1, SL, MVT::i32)));
-    } else {
-      assert(VecSize == 256);
-
+    } else if (VecSize == 256) {
       SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
       SDValue Parts[4];
       for (unsigned P = 0; P < 4; ++P) {
@@ -6442,6 +6457,20 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                             Parts[0], Parts[1]));
       Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                             Parts[2], Parts[3]));
+    } else {
+      assert(VecSize == 512);
+
+      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
+      SDValue Parts[8];
+      for (unsigned P = 0; P < 8; ++P) {
+        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+                               DAG.getConstant(P, SL, MVT::i32));
+      }
+
+      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+                                       Parts[0], Parts[1], Parts[2], Parts[3]));
+      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+                                       Parts[4], Parts[5],Parts[6], Parts[7]));
     }
 
     EVT IdxVT = Idx.getValueType();
@@ -6607,6 +6636,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
   }
 
+  if (VT == MVT::v32i16 || VT == MVT::v32f16) {
+    EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+                                     VT.getVectorNumElements() / 8);
+    MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
+
+    SmallVector<SDValue, 8> Parts[8];
+    for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
+      for (unsigned P = 0; P < 8; ++P)
+        Parts[P].push_back(Op.getOperand(I + P * E));
+    }
+    SDValue Casts[8];
+    for (unsigned P = 0; P < 8; ++P) {
+      SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+      Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+    }
+
+    SDValue Blend =
+        DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
+    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+  }
+
   assert(VT == MVT::v2f16 || VT == MVT::v2i16);
   assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
 
@@ -9507,7 +9557,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+  if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
+      VT.getSizeInBits() == 512)
     return splitTernaryVectorOp(Op, DAG);
 
   assert(VT.getSizeInBits() == 64);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3cd0821b0f86c5f..e56269438472ee2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1619,6 +1619,16 @@ def : BitConvert <v12i32, v12f32, VReg_384>;
 def : BitConvert <v12f32, v12i32, VReg_384>;
 
 // 512-bit bitcast
+def : BitConvert <v32f16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v16i32, VReg_512>;
+def : BitConvert <v32f16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32f16, VReg_512>;
+def : BitConvert <v16i32, v32f16, VReg_512>;
+def : BitConvert <v32i16, v16i32, VReg_512>;
+def : BitConvert <v32i16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32i16, VReg_512>;
+def : BitConvert <v16i32, v32i16, VReg_512>;
 def : BitConvert <v16i32, v16f32, VReg_512>;
 def : BitConvert <v16f32, v16i32, VReg_512>;
 def : BitConvert <v8i64,  v8f64,  VReg_512>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 304ee53cc5a87da..7ea2280c474b05e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -930,7 +930,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
 defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
 
 let GlobalPriority = true in {
-defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>;
 defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
 }
 
@@ -984,7 +984,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
 defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
 
 let GlobalPriority = true in {
-defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>;
 defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 2bebc5ed9b53bde..2a966b4ea178f27 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @add_i16() #0 {
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW16-LABEL: 'add_i16'
@@ -98,7 +98,7 @@ define amdgpu_kernel void @add_i16() #0 {
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW16-SIZE-LABEL: 'add_i16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index b57f26cdc2928be..564bc4912af7d30 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -57,8 +57,8 @@ define i32 @add(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -115,8 +115,8 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -237,8 +237,8 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -295,8 +295,8 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT:  Cos...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/70484