[llvm] [WebAssembly] getMemoryOpCost and getCastInstrCost (PR #122896)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 14 04:44:31 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Sam Parker (sparker-arm)
Changes:
Add initial implementations of these TTI methods for SIMD types. For casts, the costing covers the free extensions provided by extmul_low as well as extend_low. For memory operations, we consider the use of load32_zero and load64_zero, as well as full-width v128 loads.
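The new tests below exercise integer multiply-accumulate reduction loops. As a point of reference, here is a hand-written C++ approximation of the `i32_mac_s8` case (my own sketch, not part of the patch); per the tests, with `-mattr=+simd128` the vectorizer now selects VF 4, so the narrow loads become `v128.load32_zero`, the sign extensions `i16x8.extend_low_i8x16_s`, and the widening multiply `i32x4.extmul_low_i16x8_s`:

```c++
// Sketch only (not taken from the PR): scalar loop roughly equivalent to the
// i32_mac_s8 IR test case added below.
int i32_mac_s8(const signed char *a, const signed char *b, int N) {
  int res = 0;
  for (int i = 0; i < N; ++i)
    res += (int)a[i] * (int)b[i]; // i8 loads, sext to i32, multiply, accumulate
  return res;
}
```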
---
Patch is 35.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122896.diff
4 Files Affected:
- (modified) llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp (+106-2)
- (modified) llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h (+10)
- (added) llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll (+251)
- (added) llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll (+326)
``````````diff
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 3d678e53841664..3e8c1e3d3b7bed 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyTargetTransformInfo.h"
+
+#include "llvm/CodeGen/CostTable.h"
using namespace llvm;
#define DEBUG_TYPE "wasmtti"
@@ -51,8 +53,7 @@ TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+ ArrayRef<const Value *> Args, const Instruction *CxtI) {
InstructionCost Cost =
BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
@@ -78,6 +79,109 @@ InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
return Cost;
}
+InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
+ unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind, const Instruction *I) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ auto SrcTy = TLI->getValueType(DL, Src);
+ auto DstTy = TLI->getValueType(DL, Dst);
+
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) {
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+ }
+
+ if (!ST->hasSIMD128()) {
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+ }
+
+ auto DstVT = DstTy.getSimpleVT();
+ auto SrcVT = SrcTy.getSimpleVT();
+
+ if (I && I->hasOneUser()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode());
+
+ // extmul_low support
+ if (UserISD == ISD::MUL &&
+ (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
+ // Free low extensions.
+ if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
+ (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
+ (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
+ return 0;
+ }
+ // Will require an additional extlow operation for the intermediate
+ // i16/i32 value.
+ if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
+ (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
+ return 1;
+ }
+ }
+ }
+
+ // extend_low
+ static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
+ {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
+ {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
+ {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
+ {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
+ {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
+ {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
+ {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
+ {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
+ {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
+ {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
+ };
+
+ if (const auto *Entry =
+ ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) {
+ return Entry->Cost;
+ }
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+}
+
+InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
+ unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
+ const Instruction *I) {
+ if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) {
+ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+ CostKind);
+ }
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (ISD != ISD::LOAD) {
+ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+ CostKind);
+ }
+
+ EVT VT = TLI->getValueType(DL, Ty, true);
+ // Type legalization can't handle structs
+ if (VT == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+ CostKind);
+
+ auto LT = getTypeLegalizationCost(Ty);
+ if (!LT.first.isValid())
+ return InstructionCost::getInvalid();
+
+ // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
+ // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
+ // are twice as expensive as scalar.
+ unsigned width = VT.getSizeInBits();
+ switch (width) {
+ default:
+ break;
+ case 32:
+ case 64:
+ case 128:
+ return 2;
+ }
+
+ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
+}
+
InstructionCost
WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 9691120b2e531d..b9d02bd77a00f0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -64,6 +64,16 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr);
+
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getMemoryOpCost(
+ unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+ const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
new file mode 100644
index 00000000000000..fad15dc31dbfe2
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -0,0 +1,251 @@
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
+
+target triple = "wasm32"
+
+define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_s8:
+; CHECK: v128.load32_zero 0:p2align=0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: v128.load32_zero 0:p2align=0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = sext i8 %1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_s16:
+; CHECK: i32x4.load16x4_s 0:p2align=1
+; CHECK: i32x4.load16x4_s 0:p2align=1
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.09
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sext i16 %0 to i32
+ %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.09
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv2 = sext i16 %1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i64_mac_s16:
+; CHECK: v128.load32_zero 0:p2align=1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: v128.load32_zero 0:p2align=1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i64x2.extmul_low_i32x4_s
+; CHECK: i64x2.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ ret i64 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i64 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.09
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sext i16 %0 to i64
+ %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.09
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv2 = sext i16 %1 to i64
+ %mul = mul nsw i64 %conv2, %conv
+ %add = add nsw i64 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i64_mac_s32:
+; CHECK: v128.load64_zero 0:p2align=2
+; CHECK: v128.load64_zero 0:p2align=2
+; CHECK: i32x4.mul
+; CHECK: i64x2.extend_low_i32x4_s
+; CHECK: i64x2.add
+entry:
+ %cmp6.not = icmp eq i32 %N, 0
+ br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ ret i64 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.08
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.08
+ %1 = load i32, ptr %arrayidx1, align 4
+ %mul = mul i32 %1, %0
+ %conv = sext i32 %mul to i64
+ %add = add i64 %res.07, %conv
+ %inc = add nuw i32 %i.08, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_u8:
+; CHECK: v128.load32_zero 0:p2align=0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: v128.load32_zero 0:p2align=0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extmul_low_i16x8_u
+; CHECK: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %mul = mul nuw nsw i32 %conv2, %conv
+ %add = add i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_u16:
+; CHECK: i32x4.load16x4_u 0:p2align=1
+; CHECK: i32x4.load16x4_u 0:p2align=1
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.09
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = zext i16 %0 to i32
+ %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.09
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv2 = zext i16 %1 to i32
+ %mul = mul nuw nsw i32 %conv2, %conv
+ %add = add i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i64_mac_u16:
+; CHECK: v128.load32_zero 0:p2align=1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load32_zero 0:p2align=1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i64x2.extmul_low_i32x4_u
+; CHECK: i64x2.add
+entry:
+ %cmp8.not = icmp eq i32 %N, 0
+ br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ ret i64 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.010
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = zext i16 %0 to i64
+ %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.010
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv2 = zext i16 %1 to i64
+ %mul = mul nuw nsw i64 %conv2, %conv
+ %add = add i64 %mul, %res.09
+ %inc = add nuw i32 %i.010, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i64_mac_u32:
+; CHECK: v128.load64_zero 0:p2align=2
+; CHECK: v128.load64_zero 0:p2align=2
+; CHECK: i32x4.mul
+; CHECK: i64x2.extend_low_i32x4_u
+; CHECK: i64x2.add
+entry:
+ %cmp6.not = icmp eq i32 %N, 0
+ br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ ret i64 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.08
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.08
+ %1 = load i32, ptr %arrayidx1, align 4
+ %mul = mul i32 %1, %0
+ %conv = zext i32 %mul to i64
+ %add = add i64 %res.07, %conv
+ %inc = add nuw i32 %i.08, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll
new file mode 100644
index 00000000000000..d23c2272d9c0d7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll
@@ -0,0 +1,326 @@
+; REQUIRES: asserts
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s
+
+target triple = "wasm32"
+
+define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: 'i32_mac_s8'
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8, ptr %arrayidx, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i8, ptr %arrayidx1, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i32 %conv2, %conv
+
+; CHECK: LV: Found an estimated cost of 3 for VF 2 For instruction: %0 = load i8, ptr %arrayidx, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %conv = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 3 for VF 2 For instruction: %1 = load i8, ptr %arrayidx1, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %conv2 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %mul = mul nsw i32 %conv2, %conv
+
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %0 = load i8, ptr %arrayidx, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %conv = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %1 = load i8, ptr %arrayidx1, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %conv2 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %mul = mul nsw i32 %conv2, %conv
+; CHECK: LV: Selecting VF: 4.
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.09
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.09
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = sext i8 %1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: 'i32_mac_s16'
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i16, ptr %arrayidx, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = sext i16 %0 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i16, ptr %arrayidx1, align 2
+; CHECK: ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/122896