[llvm] 2abaa02 - [AArch64] Teach the costmodel about widening muls

Mon Apr 4 04:45:09 PDT 2022

Author: David Green
Date: 2022-04-04T12:45:04+01:00
New Revision: 2abaa027d9dc247cbc8baa3aca1455fa9768c1c4

URL: https://github.com/llvm/llvm-project/commit/2abaa027d9dc247cbc8baa3aca1455fa9768c1c4
DIFF: https://github.com/llvm/llvm-project/commit/2abaa027d9dc247cbc8baa3aca1455fa9768c1c4.diff

LOG: [AArch64] Teach the costmodel about widening muls

A vector mul(sext, sext) or mul(zext, zext) will be code generated as a
single smull or umull instruction. This most notably affects v2i64
multiplies, which are otherwise not legal and need to be expanded.

The one-use check has also been slightly changed: it is removed from
isWideningInstruction, as getCastInstrCost already checks it before
calling isWideningInstruction.

Differential Revision: https://reviews.llvm.org/D123006

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/AArch64/arith-widening.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0aa39c99dbcd6..2b00652860862 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1289,26 +1289,32 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
   // instructions.
   //
-  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+  // TODO: Add additional widening operations (e.g., shl, etc.) once we
   //       verify that their extending operands are eliminated during code
   //       generation.
   switch (Opcode) {
   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+  case Instruction::Mul: // SMULL(2), UMULL(2)
     break;
   default:
     return false;
   }
 
   // To be a widening instruction (either the "wide" or "long" versions), the
-  // second operand must be a sign- or zero extend having a single user. We
-  // only consider extends having a single user because they may otherwise not
-  // be eliminated.
+  // second operand must be a sign- or zero extend.
   if (Args.size() != 2 ||
-      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
-      !Args[1]->hasOneUse())
+      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
     return false;
   auto *Extend = cast<CastInst>(Args[1]);
+  auto *Arg0 = dyn_cast<CastInst>(Args[0]);
+
+  // A mul only has a mull version (not like addw). Both operands need to be
+  // extending and the same type.
+  if (Opcode == Instruction::Mul &&
+      (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
+       Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
+    return false;
 
   // Legalize the destination type and ensure it can be used in a widening
   // operation.
@@ -1346,7 +1352,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
 
   // If the cast is observable, and it is used by a widening instruction (e.g.,
   // uaddl, saddw, etc.), it may be free.
-  if (I && I->hasOneUse()) {
+  if (I && I->hasOneUser()) {
     auto *SingleUser = cast<Instruction>(*I->user_begin());
     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
@@ -1831,8 +1837,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     return Cost;
   }
   case ISD::MUL:
-    if (LT.second != MVT::v2i64)
-      return LT.first;
     // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
     // as elements are extracted from the vectors and the muls scalarized.
     // As getScalarizationOverhead is a bit too pessimistic, we estimate the
@@ -1841,7 +1845,10 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     // - two i64 inserts, and
     // - two muls.
     // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
-    // LT.first = 2 the cost is 16.
+    // LT.first = 2 the cost is 16. If both operands are extensions it will not
+    // need to scalarize so the cost can be cheaper (smull or umull).
+    if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
+      return LT.first;
     return LT.first * 8;
   case ISD::ADD:
   case ISD::XOR:

diff  --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll
index 38e28d0062f6f..0b96a0090ea12 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll
@@ -1585,14 +1585,14 @@ define void @extmulv2(<2 x i8> %i8, <2 x i16> %i16, <2 x i32> %i32, <2 x i64> %i
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %azl_16_64 = mul <2 x i64> %zl1_16_64, %zl2_16_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sw_32_64 = sext <2 x i32> %i32 to <2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %asw_32_64 = mul <2 x i64> %i64, %sw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sl1_32_64 = sext <2 x i32> %i32 to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sl2_32_64 = sext <2 x i32> %i32 to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %asl_32_64 = mul <2 x i64> %sl1_32_64, %sl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_32_64 = sext <2 x i32> %i32 to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_32_64 = sext <2 x i32> %i32 to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %asl_32_64 = mul <2 x i64> %sl1_32_64, %sl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zw_32_64 = zext <2 x i32> %i32 to <2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %azw_32_64 = mul <2 x i64> %i64, %zw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zl1_32_64 = zext <2 x i32> %i32 to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zl2_32_64 = zext <2 x i32> %i32 to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %azl_32_64 = mul <2 x i64> %zl1_32_64, %zl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_32_64 = zext <2 x i32> %i32 to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_32_64 = zext <2 x i32> %i32 to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %azl_32_64 = mul <2 x i64> %zl1_32_64, %zl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %sw_8_16 = sext <2 x i8> %i8 to <2 x i16>
@@ -1704,13 +1704,13 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %azl_8_64 = mul <4 x i64> %zl1_8_64, %zl2_8_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sw_16_32 = sext <4 x i16> %i16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %asw_16_32 = mul <4 x i32> %i32, %sw_16_32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sl1_16_32 = sext <4 x i16> %i16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sl2_16_32 = sext <4 x i16> %i16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_16_32 = sext <4 x i16> %i16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_16_32 = sext <4 x i16> %i16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %asl_16_32 = mul <4 x i32> %sl1_16_32, %sl2_16_32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zw_16_32 = zext <4 x i16> %i16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %azw_16_32 = mul <4 x i32> %i32, %zw_16_32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zl1_16_32 = zext <4 x i16> %i16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zl2_16_32 = zext <4 x i16> %i16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_16_32 = zext <4 x i16> %i16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_16_32 = zext <4 x i16> %i16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64
@@ -1724,14 +1724,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sl2_32_64 = sext <4 x i32> %i32 to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %asl_32_64 = mul <4 x i64> %sl1_32_64, %sl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_32_64 = sext <4 x i32> %i32 to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %asl_32_64 = mul <4 x i64> %sl1_32_64, %sl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zw_32_64 = zext <4 x i32> %i32 to <4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %azw_32_64 = mul <4 x i64> %i64, %zw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zl1_32_64 = zext <4 x i32> %i32 to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zl2_32_64 = zext <4 x i32> %i32 to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %azl_32_64 = mul <4 x i64> %zl1_32_64, %zl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_32_64 = zext <4 x i32> %i32 to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_32_64 = zext <4 x i32> %i32 to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %azl_32_64 = mul <4 x i64> %zl1_32_64, %zl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %sw_8_16 = sext <4 x i8> %i8 to <4 x i16>
@@ -1813,13 +1813,13 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
 ; CHECK-LABEL: 'extmulv8'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sw_8_16 = sext <8 x i8> %i8 to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %asw_8_16 = mul <8 x i16> %i16, %sw_8_16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sl1_8_16 = sext <8 x i8> %i8 to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sl2_8_16 = sext <8 x i8> %i8 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_8_16 = sext <8 x i8> %i8 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_8_16 = sext <8 x i8> %i8 to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %asl_8_16 = mul <8 x i16> %sl1_8_16, %sl2_8_16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zw_8_16 = zext <8 x i8> %i8 to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %azw_8_16 = mul <8 x i16> %i16, %zw_8_16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zl1_8_16 = zext <8 x i8> %i8 to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zl2_8_16 = zext <8 x i8> %i8 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_8_16 = zext <8 x i8> %i8 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_8_16 = zext <8 x i8> %i8 to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32
@@ -1843,13 +1843,13 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sl2_16_32 = sext <8 x i16> %i16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_16_32 = sext <8 x i16> %i16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %asl_16_32 = mul <8 x i32> %sl1_16_32, %sl2_16_32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zw_16_32 = zext <8 x i16> %i16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %azw_16_32 = mul <8 x i32> %i32, %zw_16_32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zl1_16_32 = zext <8 x i16> %i16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zl2_16_32 = zext <8 x i16> %i16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_16_32 = zext <8 x i16> %i16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_16_32 = zext <8 x i16> %i16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64
@@ -1863,14 +1863,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sl2_32_64 = sext <8 x i32> %i32 to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %asl_32_64 = mul <8 x i64> %sl1_32_64, %sl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_32_64 = sext <8 x i32> %i32 to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %asl_32_64 = mul <8 x i64> %sl1_32_64, %sl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %zw_32_64 = zext <8 x i32> %i32 to <8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %azw_32_64 = mul <8 x i64> %i64, %zw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %zl1_32_64 = zext <8 x i32> %i32 to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %zl2_32_64 = zext <8 x i32> %i32 to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %azl_32_64 = mul <8 x i64> %zl1_32_64, %zl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_32_64 = zext <8 x i32> %i32 to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_32_64 = zext <8 x i32> %i32 to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %azl_32_64 = mul <8 x i64> %zl1_32_64, %zl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %sw_8_16 = sext <8 x i8> %i8 to <8 x i16>
@@ -1952,13 +1952,13 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
 ; CHECK-LABEL: 'extmulv16'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sw_8_16 = sext <16 x i8> %i8 to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %asw_8_16 = mul <16 x i16> %i16, %sw_8_16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sl1_8_16 = sext <16 x i8> %i8 to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sl2_8_16 = sext <16 x i8> %i8 to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_8_16 = sext <16 x i8> %i8 to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_8_16 = sext <16 x i8> %i8 to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %asl_8_16 = mul <16 x i16> %sl1_8_16, %sl2_8_16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zw_8_16 = zext <16 x i8> %i8 to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %azw_8_16 = mul <16 x i16> %i16, %zw_8_16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zl1_8_16 = zext <16 x i8> %i8 to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zl2_8_16 = zext <16 x i8> %i8 to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_8_16 = zext <16 x i8> %i8 to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_8_16 = zext <16 x i8> %i8 to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32
@@ -1982,13 +1982,13 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sl2_16_32 = sext <16 x i16> %i16 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_16_32 = sext <16 x i16> %i16 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %asl_16_32 = mul <16 x i32> %sl1_16_32, %sl2_16_32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %zw_16_32 = zext <16 x i16> %i16 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %azw_16_32 = mul <16 x i32> %i32, %zw_16_32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %zl1_16_32 = zext <16 x i16> %i16 to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %zl2_16_32 = zext <16 x i16> %i16 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_16_32 = zext <16 x i16> %i16 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_16_32 = zext <16 x i16> %i16 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64
@@ -2002,14 +2002,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sl2_32_64 = sext <16 x i32> %i32 to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %asl_32_64 = mul <16 x i64> %sl1_32_64, %sl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sl2_32_64 = sext <16 x i32> %i32 to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %asl_32_64 = mul <16 x i64> %sl1_32_64, %sl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %zw_32_64 = zext <16 x i32> %i32 to <16 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %azw_32_64 = mul <16 x i64> %i64, %zw_32_64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %zl1_32_64 = zext <16 x i32> %i32 to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %zl2_32_64 = zext <16 x i32> %i32 to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %azl_32_64 = mul <16 x i64> %zl1_32_64, %zl2_32_64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl1_32_64 = zext <16 x i32> %i32 to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zl2_32_64 = zext <16 x i32> %i32 to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %azl_32_64 = mul <16 x i64> %zl1_32_64, %zl2_32_64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %sw_8_16 = sext <16 x i8> %i8 to <16 x i16>