[llvm] [LV] Enable considering higher VFs when data extend ops are present i… (PR #137593)

via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 28 01:01:53 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-powerpc

Author: Sushant Gokhale (sushgokh)

<details>
<summary>Changes</summary>

Enable considering higher VFs when data extend ops are present in the loop

LV currently limits the maximum VF based on the widest type in the loop. This is not always beneficial for loops that contain data-extend operations (e.g. zext, sext): the widest-type limit can prevent higher VFs from even being considered, even though a higher VF might be profitable.

This patch relaxes that constraint so that higher VFs are considered, and leaves the decision of whether a particular VF is beneficial to the cost model.
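
To make the motivation concrete, here is a minimal C sketch (illustrative only, not part of the patch; similar in spirit to the `i32_mac_s8` test in the diff) of the kind of loop this targets: the loads are i8 (the smallest type) but the accumulator is i32 (the widest type), so on a hypothetical 128-bit SIMD target a widest-type cap would only consider VF=4, while the i8 lanes could support VF=16 with extends.

```c
/* Illustrative only: an i8 multiply-accumulate reduction into an i32 sum.
 * Smallest type in the loop: i8 (the loads); widest type: i32 (the sum).
 * Capping the max VF at 128 / 32 = 4 hides the VF=16 plan that operates on
 * the i8 lanes and extends as needed. */
int mac_s8(const signed char *a, const signed char *b, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += (int)a[i] * (int)b[i]; /* sext i8 -> i32, then multiply-accumulate */
  return sum;
}
```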

---

Patch is 650.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137593.diff


50 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+9) 
- (modified) llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll (+104-159) 
- (modified) llvm/test/CodeGen/WebAssembly/interleave.ll (+29-5) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll (+61-31) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll (+3-3) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+30-30) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+22-74) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll (+140-88) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (+646-518) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll (+35-35) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll (+77-51) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll (+21-21) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll (+67-19) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll (+38-26) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll (+14-13) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll (+3-3) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll (+32-32) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll (+7-7) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll (+15-15) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll (+10-10) 
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll (+48-138) 
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll (+170-122) 
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll (+8-8) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll (+10-10) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll (+67-24) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll (+66-26) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll (+10-10) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll (+17-17) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll (+40-40) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll (+20-20) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll (+17-17) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll (+35-35) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll (+5-5) 
- (modified) llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll (+83-8) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/cost-model.ll (+247-52) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll (+7-7) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll (+15-15) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll (+237-480) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/pr47437.ll (+63-24) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll (+12-12) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll (+54-14) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll (+1-1) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll (+37-18) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll (+15-15) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll (+255-19) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f985e883d0dde..84444435bacbd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4123,6 +4123,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   auto MaxVectorElementCount = ElementCount::get(
       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
       ComputeScalableMaxVF);
+
+  // For loops with extend operations (e.g. zext, sext), limiting the max VF
+  // based on the widest type inhibits considering higher VFs even though
+  // vectorizing with a higher VF might be profitable. In such cases, limit
+  // the max VF based on the smallest type and leave the decision of whether
+  // a particular VF is beneficial to the cost model.
+  if (WidestType != SmallestType)
+    MaximizeBandwidth = true;
+
   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                     << (MaxVectorElementCount * WidestType) << " bits.\n");
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 0184e22a3b40d..ae31672854077 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -1,27 +1,20 @@
 ; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
-; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
 
 target triple = "wasm32"
 
 define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_s8:
-; CHECK:    v128.load32_zero 0:p2align=0
-; CHECK:    i16x8.extend_low_i8x16_s
-; CHECK:    v128.load32_zero 0:p2align=0
-; CHECK:    i16x8.extend_low_i8x16_s
-; CHECK:    i32x4.extmul_low_i16x8_s
-; CHECK:    i32x4.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.dot_i16x8_s
-; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
-; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
-; MAX-BANDWIDTH: i32x4.dot_i16x8_s
-; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.add
+; CHECK: v128.load
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: v128.load
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.dot_i16x8_s
+; CHECK: i16x8.extend_high_i8x16_s
+; CHECK: i16x8.extend_high_i8x16_s
+; CHECK: i32x4.dot_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i32x4.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -49,14 +42,9 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_s16:
-; CHECK:    i32x4.load16x4_s 0:p2align=1
-; CHECK:    i32x4.load16x4_s 0:p2align=1
-; CHECK:    i32x4.mul
-; CHECK:    i32x4.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i32x4.dot_i16x8_s
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -84,37 +72,30 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i64_mac_s16:
-; CHECK:    v128.load32_zero 0:p2align=1
-; CHECK:    i32x4.extend_low_i16x8_s
-; CHECK:    v128.load32_zero 0:p2align=1
-; CHECK:    i32x4.extend_low_i16x8_s
-; CHECK:    i64x2.extmul_low_i32x4_s
-; CHECK:    i64x2.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
-; MAX-BANDWIDTH: i64x2.add
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i64x2.extmul_low_i32x4_s
+; CHECK: i64x2.add
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i64x2.extmul_low_i32x4_s
+; CHECK: i64x2.add
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i64x2.extmul_low_i32x4_s
+; CHECK: i64x2.add
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i64x2.extmul_low_i32x4_s
+; CHECK: i64x2.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -142,19 +123,13 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i64_mac_s32:
-; CHECK:    v128.load64_zero 0:p2align=2
-; CHECK:    v128.load64_zero 0:p2align=2
-; CHECK:    i32x4.mul
-; CHECK:    i64x2.extend_low_i32x4_s
-; CHECK:    i64x2.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i32x4.mul
-; MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
-; MAX-BANDWIDTH: i64x2.add
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i32x4.mul
+; CHECK: i64x2.extend_high_i32x4_s
+; CHECK: i64x2.add
+; CHECK: i64x2.extend_low_i32x4_s
+; CHECK: i64x2.add
 
 entry:
   %cmp6.not = icmp eq i32 %N, 0
@@ -181,25 +156,18 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_u8:
-; CHECK:    v128.load32_zero 0:p2align=0
-; CHECK:    i16x8.extend_low_i8x16_u
-; CHECK:    v128.load32_zero 0:p2align=0
-; CHECK:    i16x8.extend_low_i8x16_u
-; CHECK:    i32x4.extmul_low_i16x8_u
-; CHECK:    i32x4.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
-; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
-; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.add
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i16x8.extmul_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.extend_high_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i16x8.extmul_high_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.extend_high_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i32x4.add
+; CHECK: i32x4.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -227,17 +195,12 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_u16:
-; CHECK:    i32x4.load16x4_u 0:p2align=1
-; CHECK:    i32x4.load16x4_u 0:p2align=1
-; CHECK:    i32x4.mul
-; CHECK:    i32x4.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
-; MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
-; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.add
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i32x4.extmul_low_i16x8_u
+; CHECK: i32x4.extmul_high_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i32x4.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -265,21 +228,16 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_u16_s16:
-; CHECK:    i32x4.load16x4_s 0:p2align=1
-; CHECK:    i32x4.load16x4_u 0:p2align=1
-; CHECK:    i32x4.mul
-; CHECK:    i32x4.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
-; MAX-BANDWIDTH: i32x4.mul
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i32x4.mul
-; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.add
+; CHECK: v128.load
+; CHECK: i32x4.extend_high_i16x8_s
+; CHECK: v128.load
+; CHECK: i32x4.extend_high_i16x8_u
+; CHECK: i32x4.mul
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+; CHECK: i32x4.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -307,37 +265,30 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i64_mac_u16:
-; CHECK:    v128.load32_zero 0:p2align=1
-; CHECK:    i32x4.extend_low_i16x8_u
-; CHECK:    v128.load32_zero 0:p2align=1
-; CHECK:    i32x4.extend_low_i16x8_u
-; CHECK:    i64x2.extmul_low_i32x4_u
-; CHECK:    i64x2.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
-; MAX-BANDWIDTH: i64x2.add
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i64x2.extmul_low_i32x4_u
+; CHECK: i64x2.add
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i64x2.extmul_low_i32x4_u
+; CHECK: i64x2.add
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i64x2.extmul_low_i32x4_u
+; CHECK: i64x2.add
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i64x2.extmul_low_i32x4_u
+; CHECK: i64x2.add
 
 entry:
   %cmp8.not = icmp eq i32 %N, 0
@@ -365,19 +316,13 @@ for.body:                                         ; preds = %entry, %for.body
 
 define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i64_mac_u32:
-; CHECK:    v128.load64_zero 0:p2align=2
-; CHECK:    v128.load64_zero 0:p2align=2
-; CHECK:    i32x4.mul
-; CHECK:    i64x2.extend_low_i32x4_u
-; CHECK:    i64x2.add
-
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i32x4.mul
-; MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
-; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
-; MAX-BANDWIDTH: i64x2.add
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i32x4.mul
+; CHECK: i64x2.extend_high_i32x4_u
+; CHECK: i64x2.add
+; CHECK: i64x2.extend_low_i32x4_u
+; CHECK: i64x2.add
 
 entry:
   %cmp6.not = icmp eq i32 %N, 0
diff --git a/llvm/test/CodeGen/WebAssembly/interleave.ll b/llvm/test/CodeGen/WebAssembly/interleave.ll
index c20b5e42c4850..5572510fa02ea 100644
--- a/llvm/test/CodeGen/WebAssembly/interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/interleave.ll
@@ -15,13 +15,37 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-n
 ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
 define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
 ; CHECK-LABEL: accumulate8x2:
-; CHECK: loop
-; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: v128.load       16:p2align=0                
+; CHECK: i8x16.shuffle   9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i32x4.add                                
+; CHECK: i8x16.shuffle   1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add                        
+; CHECK: v128.load       0:p2align=0                
+; CHECK: i8x16.shuffle   9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add                                
+; CHECK: i8x16.shuffle   1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add                                
+; CHECK: i8x16.shuffle   8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add                                
+; CHECK: i8x16.shuffle   0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add                                
+; CHECK: i8x16.shuffle   8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add                                
+; CHECK: i8x16.shuffle   0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.add
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index b96a768bba24d..a34079387e246 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -661,29 +661,59 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; DEFAULT-LABEL: define void @multiple_exit_conditions(
 ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
 ; DEFAULT-NEXT:  [[ENTRY:.*]]:
-; DEFAULT-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP0]], 16
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP6]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; DEFAULT:       [[VECTOR_PH]]:
-; DEFAULT-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
-; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
-; DEFAULT:       [[VECTOR_BODY]]:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
+; DEFAULT-NEXT:    [[INDEX:%.*]] = sub i64 257, [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; DEFAULT-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = mul i64 [[INDEX]], 2
+; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
+; DEFAULT:       [[VECTOR_BODY]]:
+; DEFAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX1]], 8
+; DEFAULT-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]]
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
-; DEFAULT-NEXT:    [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; DEFAULT-NEXT:    store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; DEFAULT-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP1]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP10:%.*]] = or <vscale x 4 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT:    [[TMP11:%.*]] = or <vscale x 4 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT:    [[TMP12:%.*]] = or <vscale x 4 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT:    [[TMP13:%.*]] = or <vscale x 4 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT:    [[TMP14:%.*]] = uitofp <vscale x 4 x i16> [[TMP10]] to <vscale x 4 x double>
+; DEFAULT-NEXT:    [[TMP15:%.*]] = uitofp <vscale x 4 x i16> [[TMP11]] to <vscale x 4 x double>
+; DEFAULT-NEXT:    [[TMP16:%.*]] = uitofp <vscale x 4 x i16> [[TMP12]] to <vscale x 4 x double>
+; DEFAULT-NEXT:    [[TMP17:%.*]] = uitofp <vscale x 4 x i16> [[TMP13]] to <vscale x 4 x double>
+; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i32 0
+; DEFAUL...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/137593

