[llvm] [VPlan] Recursively match operands of interleave group (PR #164839)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 24 09:10:25 PDT 2025
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/164839
>From d2f7593109801815752440c83ace9aef4a573fd7 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 23 Oct 2025 11:33:21 +0100
Subject: [PATCH 1/6] [LV] Pre-commit test for #128062
In preparation for extending the work done in dfa665f ([VPlan] Add
transformation to narrow interleave groups) to make the narrowing more
powerful, pre-commit a test case from #128062.
---
...28062-interleaved-accesses-narrow-group.ll | 253 ++++++++++++++++++
1 file changed, 253 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
new file mode 100644
index 0000000000000..aefcd4f5b777d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck --check-prefix=INTERLEAVED %s
+
+define void @pr128062(ptr %dst.start, i64 %n, i8 %a) {
+; CHECK-LABEL: define void @pr128062(
+; CHECK-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[IV_START:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[IV_START]], -4
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP7]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP8]]
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP9]]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[NEXT_GEP3]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[NEXT_GEP4]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[NEXT_GEP5]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP12]], i32 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 2
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 3
+; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
+; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
+; CHECK-NEXT: [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i8> [[TMP23]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i8> [[TMP23]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i8> [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i8> [[TMP23]], i32 3
+; CHECK-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: store i8 [[TMP25]], ptr [[NEXT_GEP3]], align 1
+; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP4]], align 1
+; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP5]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 1
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 1
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 1
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 1
+; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
+; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
+; CHECK-NEXT: [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
+; CHECK-NEXT: [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
+; CHECK-NEXT: [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i8> [[TMP44]], i32 0
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i8> [[TMP44]], i32 1
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i8> [[TMP44]], i32 2
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i8> [[TMP44]], i32 3
+; CHECK-NEXT: store i8 [[TMP45]], ptr [[TMP28]], align 1
+; CHECK-NEXT: store i8 [[TMP46]], ptr [[TMP29]], align 1
+; CHECK-NEXT: store i8 [[TMP47]], ptr [[TMP30]], align 1
+; CHECK-NEXT: store i8 [[TMP48]], ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 2
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 2
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 2
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 2
+; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP49]], align 1
+; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP50]], align 1
+; CHECK-NEXT: [[TMP55:%.*]] = load i8, ptr [[TMP51]], align 1
+; CHECK-NEXT: [[TMP56:%.*]] = load i8, ptr [[TMP52]], align 1
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i8> poison, i8 [[TMP53]], i32 0
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x i8> [[TMP57]], i8 [[TMP54]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <4 x i8> [[TMP58]], i8 [[TMP55]], i32 2
+; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i8> [[TMP59]], i8 [[TMP56]], i32 3
+; CHECK-NEXT: [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
+; CHECK-NEXT: [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
+; CHECK-NEXT: [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
+; CHECK-NEXT: [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
+; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i8> [[TMP65]], i32 0
+; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i8> [[TMP65]], i32 1
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i8> [[TMP65]], i32 2
+; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i8> [[TMP65]], i32 3
+; CHECK-NEXT: store i8 [[TMP66]], ptr [[TMP49]], align 1
+; CHECK-NEXT: store i8 [[TMP67]], ptr [[TMP50]], align 1
+; CHECK-NEXT: store i8 [[TMP68]], ptr [[TMP51]], align 1
+; CHECK-NEXT: store i8 [[TMP69]], ptr [[TMP52]], align 1
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 3
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 3
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 3
+; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 3
+; CHECK-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP70]], align 1
+; CHECK-NEXT: [[TMP75:%.*]] = load i8, ptr [[TMP71]], align 1
+; CHECK-NEXT: [[TMP76:%.*]] = load i8, ptr [[TMP72]], align 1
+; CHECK-NEXT: [[TMP77:%.*]] = load i8, ptr [[TMP73]], align 1
+; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i8> poison, i8 [[TMP74]], i32 0
+; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i8> [[TMP78]], i8 [[TMP75]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i8> [[TMP79]], i8 [[TMP76]], i32 2
+; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i8> [[TMP80]], i8 [[TMP77]], i32 3
+; CHECK-NEXT: [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
+; CHECK-NEXT: [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
+; CHECK-NEXT: [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
+; CHECK-NEXT: [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
+; CHECK-NEXT: [[TMP87:%.*]] = extractelement <4 x i8> [[TMP86]], i32 0
+; CHECK-NEXT: [[TMP88:%.*]] = extractelement <4 x i8> [[TMP86]], i32 1
+; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i8> [[TMP86]], i32 2
+; CHECK-NEXT: [[TMP90:%.*]] = extractelement <4 x i8> [[TMP86]], i32 3
+; CHECK-NEXT: store i8 [[TMP87]], ptr [[TMP70]], align 1
+; CHECK-NEXT: store i8 [[TMP88]], ptr [[TMP71]], align 1
+; CHECK-NEXT: store i8 [[TMP89]], ptr [[TMP72]], align 1
+; CHECK-NEXT: store i8 [[TMP90]], ptr [[TMP73]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+; INTERLEAVED-LABEL: define void @pr128062(
+; INTERLEAVED-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
+; INTERLEAVED-NEXT: [[ENTRY:.*:]]
+; INTERLEAVED-NEXT: [[IV_START:%.*]] = and i64 [[N]], -4
+; INTERLEAVED-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[IV_START]], -4
+; INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVED: [[VECTOR_PH]]:
+; INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
+; INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
+; INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
+; INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
+; INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
+; INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
+; INTERLEAVED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
+; INTERLEAVED: [[VECTOR_BODY]]:
+; INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
+; INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; INTERLEAVED-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; INTERLEAVED-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; INTERLEAVED-NEXT: [[TMP60:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; INTERLEAVED-NEXT: [[TMP81:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
+; INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT: [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
+; INTERLEAVED-NEXT: [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
+; INTERLEAVED-NEXT: [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
+; INTERLEAVED-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
+; INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT: [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
+; INTERLEAVED-NEXT: [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
+; INTERLEAVED-NEXT: [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
+; INTERLEAVED-NEXT: [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
+; INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT: [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
+; INTERLEAVED-NEXT: [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
+; INTERLEAVED-NEXT: [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
+; INTERLEAVED-NEXT: [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
+; INTERLEAVED-NEXT: [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT: [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
+; INTERLEAVED-NEXT: [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
+; INTERLEAVED-NEXT: [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
+; INTERLEAVED-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP44]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; INTERLEAVED-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP65]], <4 x i8> [[TMP86]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; INTERLEAVED-NEXT: [[TMP29:%.*]] = shufflevector <8 x i8> [[TMP27]], <8 x i8> [[TMP28]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; INTERLEAVED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; INTERLEAVED-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
+; INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; INTERLEAVED-NEXT: [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVED-NEXT: br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVED: [[MIDDLE_BLOCK]]:
+; INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; INTERLEAVED-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; INTERLEAVED: [[SCALAR_PH]]:
+;
+entry:
+ %iv.start = and i64 %n, -4
+ %a.ext = zext i8 %a to i16
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ %iv.start, %entry ]
+ %dst = phi ptr [ %dst.next, %loop ], [ %dst.start, %entry ]
+ %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4
+ %load.dst = load i8, ptr %dst, align 1
+ %dst.ext = zext i8 %load.dst to i16
+ %mul.dst.0 = mul nuw i16 %dst.ext, %a.ext
+ %udiv.0 = udiv i16 %mul.dst.0, 255
+ %trunc.0 = trunc nuw i16 %udiv.0 to i8
+ %val.0 = add i8 %a, %trunc.0
+ store i8 %val.0, ptr %dst, align 1
+ %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1
+ %load.dst.1 = load i8, ptr %gep.dst.1, align 1
+ %dst.1.ext = zext i8 %load.dst.1 to i16
+ %mul.dst.1 = mul nuw i16 %dst.1.ext, %a.ext
+ %udiv.1 = udiv i16 %mul.dst.1, 255
+ %trunc.1 = trunc nuw i16 %udiv.1 to i8
+ %val.1 = add i8 %a, %trunc.1
+ store i8 %val.1, ptr %gep.dst.1, align 1
+ %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2
+ %load.dst.2 = load i8, ptr %gep.dst.2, align 1
+ %dst.2.ext = zext i8 %load.dst.2 to i16
+ %mul.dst.2 = mul nuw i16 %dst.2.ext, %a.ext
+ %udiv.2 = udiv i16 %mul.dst.2, 255
+ %trunc.2 = trunc nuw i16 %udiv.2 to i8
+ %val.2 = add i8 %a, %trunc.2
+ store i8 %val.2, ptr %gep.dst.2, align 1
+ %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3
+ %load.dst.3 = load i8, ptr %gep.dst.3, align 1
+ %dst.3.ext = zext i8 %load.dst.3 to i16
+ %mul.dst.3 = mul nuw i16 %dst.3.ext, %a.ext
+ %udiv.3 = udiv i16 %mul.dst.3, 255
+ %trunc.3 = trunc nuw i16 %udiv.3 to i8
+ %val.3 = add i8 %a, %trunc.3
+ store i8 %val.3, ptr %gep.dst.3, align 1
+ %iv.next = add i64 %iv, -4
+ %exit.cond = icmp eq i64 %iv.next, 0
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}
>From 0b4c451bc6f7420addf40f7ff80c3ecb67a8f0be Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 23 Oct 2025 20:12:22 +0100
Subject: [PATCH 2/6] [LV] Improve test
---
...28062-interleaved-accesses-narrow-group.ll | 252 ++++--------------
1 file changed, 50 insertions(+), 202 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index aefcd4f5b777d..bbba4f612dbd3 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -1,26 +1,13 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6
-; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
-; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck --check-prefix=INTERLEAVED %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck %s
-define void @pr128062(ptr %dst.start, i64 %n, i8 %a) {
+define void @pr128062(ptr %dst.start, i8 %a, i16 %b) {
; CHECK-LABEL: define void @pr128062(
-; CHECK-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-SAME: ptr [[DST_START:%.*]], i8 [[A:%.*]], i16 [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[IV_START:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[IV_START]], -4
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
@@ -28,194 +15,55 @@ define void @pr128062(ptr %dst.start, i64 %n, i8 %a) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 12
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP7]]
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP8]]
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP9]]
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[NEXT_GEP3]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[NEXT_GEP4]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[NEXT_GEP5]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP12]], i32 1
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 2
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 3
-; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
-; CHECK-NEXT: [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
-; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
-; CHECK-NEXT: [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i8> [[TMP23]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i8> [[TMP23]], i32 1
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i8> [[TMP23]], i32 2
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i8> [[TMP23]], i32 3
-; CHECK-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: store i8 [[TMP25]], ptr [[NEXT_GEP3]], align 1
-; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP4]], align 1
-; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP5]], align 1
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 1
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 1
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 1
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 1
-; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP28]], align 1
-; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP29]], align 1
-; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP30]], align 1
-; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP31]], align 1
-; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
-; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
-; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 3
-; CHECK-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
-; CHECK-NEXT: [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
-; CHECK-NEXT: [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
-; CHECK-NEXT: [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i8> [[TMP44]], i32 0
-; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i8> [[TMP44]], i32 1
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i8> [[TMP44]], i32 2
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i8> [[TMP44]], i32 3
-; CHECK-NEXT: store i8 [[TMP45]], ptr [[TMP28]], align 1
-; CHECK-NEXT: store i8 [[TMP46]], ptr [[TMP29]], align 1
-; CHECK-NEXT: store i8 [[TMP47]], ptr [[TMP30]], align 1
-; CHECK-NEXT: store i8 [[TMP48]], ptr [[TMP31]], align 1
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 2
-; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 2
-; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 2
-; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 2
-; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP49]], align 1
-; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP50]], align 1
-; CHECK-NEXT: [[TMP55:%.*]] = load i8, ptr [[TMP51]], align 1
-; CHECK-NEXT: [[TMP56:%.*]] = load i8, ptr [[TMP52]], align 1
-; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i8> poison, i8 [[TMP53]], i32 0
-; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x i8> [[TMP57]], i8 [[TMP54]], i32 1
-; CHECK-NEXT: [[TMP59:%.*]] = insertelement <4 x i8> [[TMP58]], i8 [[TMP55]], i32 2
-; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i8> [[TMP59]], i8 [[TMP56]], i32 3
-; CHECK-NEXT: [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
-; CHECK-NEXT: [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
-; CHECK-NEXT: [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
-; CHECK-NEXT: [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i8> [[TMP65]], i32 0
-; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i8> [[TMP65]], i32 1
-; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i8> [[TMP65]], i32 2
-; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i8> [[TMP65]], i32 3
-; CHECK-NEXT: store i8 [[TMP66]], ptr [[TMP49]], align 1
-; CHECK-NEXT: store i8 [[TMP67]], ptr [[TMP50]], align 1
-; CHECK-NEXT: store i8 [[TMP68]], ptr [[TMP51]], align 1
-; CHECK-NEXT: store i8 [[TMP69]], ptr [[TMP52]], align 1
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 3
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 3
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 3
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 3
-; CHECK-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP70]], align 1
-; CHECK-NEXT: [[TMP75:%.*]] = load i8, ptr [[TMP71]], align 1
-; CHECK-NEXT: [[TMP76:%.*]] = load i8, ptr [[TMP72]], align 1
-; CHECK-NEXT: [[TMP77:%.*]] = load i8, ptr [[TMP73]], align 1
-; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i8> poison, i8 [[TMP74]], i32 0
-; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i8> [[TMP78]], i8 [[TMP75]], i32 1
-; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i8> [[TMP79]], i8 [[TMP76]], i32 2
-; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i8> [[TMP80]], i8 [[TMP77]], i32 3
-; CHECK-NEXT: [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
-; CHECK-NEXT: [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
-; CHECK-NEXT: [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
-; CHECK-NEXT: [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
-; CHECK-NEXT: [[TMP87:%.*]] = extractelement <4 x i8> [[TMP86]], i32 0
-; CHECK-NEXT: [[TMP88:%.*]] = extractelement <4 x i8> [[TMP86]], i32 1
-; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i8> [[TMP86]], i32 2
-; CHECK-NEXT: [[TMP90:%.*]] = extractelement <4 x i8> [[TMP86]], i32 3
-; CHECK-NEXT: store i8 [[TMP87]], ptr [[TMP70]], align 1
-; CHECK-NEXT: store i8 [[TMP88]], ptr [[TMP71]], align 1
-; CHECK-NEXT: store i8 [[TMP89]], ptr [[TMP72]], align 1
-; CHECK-NEXT: store i8 [[TMP90]], ptr [[TMP73]], align 1
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
+; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[STRIDED_VEC3]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255)
+; CHECK-NEXT: [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255)
+; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255)
+; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-;
-; INTERLEAVED-LABEL: define void @pr128062(
-; INTERLEAVED-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
-; INTERLEAVED-NEXT: [[ENTRY:.*:]]
-; INTERLEAVED-NEXT: [[IV_START:%.*]] = and i64 [[N]], -4
-; INTERLEAVED-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
-; INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[IV_START]], -4
-; INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; INTERLEAVED: [[VECTOR_PH]]:
-; INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
-; INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
-; INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
-; INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
-; INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
-; INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
-; INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
-; INTERLEAVED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
-; INTERLEAVED: [[VECTOR_BODY]]:
-; INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
-; INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; INTERLEAVED-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; INTERLEAVED-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; INTERLEAVED-NEXT: [[TMP60:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; INTERLEAVED-NEXT: [[TMP81:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
-; INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT: [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
-; INTERLEAVED-NEXT: [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
-; INTERLEAVED-NEXT: [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
-; INTERLEAVED-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
-; INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT: [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
-; INTERLEAVED-NEXT: [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
-; INTERLEAVED-NEXT: [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
-; INTERLEAVED-NEXT: [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
-; INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT: [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
-; INTERLEAVED-NEXT: [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
-; INTERLEAVED-NEXT: [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
-; INTERLEAVED-NEXT: [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
-; INTERLEAVED-NEXT: [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT: [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
-; INTERLEAVED-NEXT: [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
-; INTERLEAVED-NEXT: [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
-; INTERLEAVED-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP44]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; INTERLEAVED-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP65]], <4 x i8> [[TMP86]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; INTERLEAVED-NEXT: [[TMP29:%.*]] = shufflevector <8 x i8> [[TMP27]], <8 x i8> [[TMP28]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; INTERLEAVED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; INTERLEAVED-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
-; INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; INTERLEAVED-NEXT: [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVED-NEXT: br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; INTERLEAVED: [[MIDDLE_BLOCK]]:
-; INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; INTERLEAVED-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; INTERLEAVED: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
- %iv.start = and i64 %n, -4
- %a.ext = zext i8 %a to i16
br label %loop
loop:
- %iv = phi i64 [ %iv.next, %loop ], [ %iv.start, %entry ]
- %dst = phi ptr [ %dst.next, %loop ], [ %dst.start, %entry ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %dst = phi ptr [ %dst.start, %entry ], [ %dst.next, %loop ]
%dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4
%load.dst = load i8, ptr %dst, align 1
%dst.ext = zext i8 %load.dst to i16
- %mul.dst.0 = mul nuw i16 %dst.ext, %a.ext
+ %mul.dst.0 = mul nuw i16 %dst.ext, %b
%udiv.0 = udiv i16 %mul.dst.0, 255
%trunc.0 = trunc nuw i16 %udiv.0 to i8
%val.0 = add i8 %a, %trunc.0
@@ -223,7 +71,7 @@ loop:
%gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1
%load.dst.1 = load i8, ptr %gep.dst.1, align 1
%dst.1.ext = zext i8 %load.dst.1 to i16
- %mul.dst.1 = mul nuw i16 %dst.1.ext, %a.ext
+ %mul.dst.1 = mul nuw i16 %dst.1.ext, %b
%udiv.1 = udiv i16 %mul.dst.1, 255
%trunc.1 = trunc nuw i16 %udiv.1 to i8
%val.1 = add i8 %a, %trunc.1
@@ -231,7 +79,7 @@ loop:
%gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2
%load.dst.2 = load i8, ptr %gep.dst.2, align 1
%dst.2.ext = zext i8 %load.dst.2 to i16
- %mul.dst.2 = mul nuw i16 %dst.2.ext, %a.ext
+ %mul.dst.2 = mul nuw i16 %dst.2.ext, %b
%udiv.2 = udiv i16 %mul.dst.2, 255
%trunc.2 = trunc nuw i16 %udiv.2 to i8
%val.2 = add i8 %a, %trunc.2
@@ -239,13 +87,13 @@ loop:
%gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3
%load.dst.3 = load i8, ptr %gep.dst.3, align 1
%dst.3.ext = zext i8 %load.dst.3 to i16
- %mul.dst.3 = mul nuw i16 %dst.3.ext, %a.ext
+ %mul.dst.3 = mul nuw i16 %dst.3.ext, %b
%udiv.3 = udiv i16 %mul.dst.3, 255
%trunc.3 = trunc nuw i16 %udiv.3 to i8
%val.3 = add i8 %a, %trunc.3
store i8 %val.3, ptr %gep.dst.3, align 1
- %iv.next = add i64 %iv, -4
- %exit.cond = icmp eq i64 %iv.next, 0
+ %iv.next = add i64 %iv, 4
+ %exit.cond = icmp eq i64 %iv.next, 256
br i1 %exit.cond, label %exit, label %loop
exit:
>From fafb462c7e3d3d7d7bc363a35fce76a774f8a8fd Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 23 Oct 2025 14:34:11 +0100
Subject: [PATCH 3/6] [VPlan] Recursively match operands of interleave group
When narrowing interleave groups, we use the canNarrowLoad check, which
bails out when there is any recipe that is not a VPWidenLoad,
VPWidenInterleave, or live-in feeding the interleave: a lot of potential
narrowing opportunities are missed as a result. Correctly identify that
these three cases are the leaf cases, and match the recursive operands
instead.
Fixes #128062.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 88 ++++++----
...interleave-to-widen-memory-constant-ops.ll | 98 +++++------
...nterleave-to-widen-memory-with-wide-ops.ll | 165 ++++++++++--------
...28062-interleaved-accesses-narrow-group.ll | 25 +--
.../PhaseOrdering/AArch64/interleave_vec.ll | 26 +--
5 files changed, 201 insertions(+), 201 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 84817d78a077a..a38f4f07408fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -628,6 +628,14 @@ static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
return Users.takeVector();
}
+static SmallVector<VPValue *> collectOperandsRecursively(VPRecipeBase *R) {
+ SetVector<VPValue *> Operands(llvm::from_range, R->operands());
+ for (unsigned I = 0; I != Operands.size(); ++I)
+ if (auto *Cur = Operands[I]->getDefiningRecipe())
+ Operands.insert_range(Cur->operands());
+ return Operands.takeVector();
+}
+
/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
@@ -4064,25 +4072,42 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
return ExpandedSCEVs;
}
-/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
-/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
-/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
-/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
-/// an index-independent load if it feeds all wide ops at all indices (\p OpV
-/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
-/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
-/// is defined at \p Idx of a load interleave group.
-static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
- VPValue *OpV, unsigned Idx) {
- auto *DefR = OpV->getDefiningRecipe();
- if (!DefR)
- return WideMember0->getOperand(OpIdx) == OpV;
- if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
- return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
-
- if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
- return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
- return false;
+/// Returns true if the \p StoredValues of an interleave group match. It does
+/// this by going through operands recursively until it hits the leaf cases:
+/// VPWidenLoadRecipe, VPInterleaveRecipe, and live-ins.
+static bool interleaveStoredValuesMatch(ArrayRef<VPValue *> StoredValues) {
+ auto *WideMember0 =
+ dyn_cast_or_null<VPWidenRecipe>(StoredValues[0]->getDefiningRecipe());
+ if (!WideMember0)
+ return false;
+ SmallVector<VPValue *> Ops0 = collectOperandsRecursively(WideMember0);
+ for (VPValue *ValI : StoredValues) {
+ auto *WideMemberI =
+ dyn_cast_or_null<VPWidenRecipe>(ValI->getDefiningRecipe());
+ if (!WideMemberI || WideMemberI->getOpcode() != WideMember0->getOpcode())
+ return false;
+ SmallVector<VPValue *> OpsI = collectOperandsRecursively(WideMemberI);
+ if (Ops0.size() != OpsI.size())
+ return false;
+ for (const auto &[Op0, OpI] : zip(Ops0, OpsI)) {
+ auto *Def0 = Op0->getDefiningRecipe();
+ auto *DefI = OpI->getDefiningRecipe();
+ if (!Def0 || !DefI) {
+ if (Op0 != OpI)
+ return false;
+ } else if (Def0->getVPDefID() != DefI->getVPDefID()) {
+ return false;
+ } else if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefI)) {
+ if (W->isMasked() || Op0 != OpI)
+ return false;
+ } else if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefI)) {
+ if (!IR->getInterleaveGroup()->isFull() ||
+ !equal(DefI->definedValues(), Def0->definedValues()))
+ return false;
+ }
+ }
+ }
+ return true;
}
/// Returns true if \p IR is a full interleave group with factor and number of
@@ -4201,24 +4226,9 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
continue;
}
- // Check if all values feeding InterleaveR are matching wide recipes, which
- // operands that can be narrowed.
- auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
- InterleaveR->getStoredValues()[0]->getDefiningRecipe());
- if (!WideMember0)
+ // Check if all values feeding InterleaveR match.
+ if (!interleaveStoredValuesMatch(InterleaveR->getStoredValues()))
return;
- for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
- auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
- if (!R || R->getOpcode() != WideMember0->getOpcode() ||
- R->getNumOperands() > 2)
- return;
- if (any_of(enumerate(R->operands()),
- [WideMember0, Idx = I](const auto &P) {
- const auto &[OpIdx, OpV] = P;
- return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
- }))
- return;
- }
StoreGroups.push_back(InterleaveR);
}
@@ -4250,7 +4260,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
NarrowedOps.insert(RepR);
return RepR;
}
- auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+ auto *WideLoad = dyn_cast<VPWidenLoadRecipe>(R);
+ if (!WideLoad) {
+ NarrowedOps.insert(V);
+ return V;
+ }
VPValue *PtrOp = WideLoad->getAddr();
if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index 005ca8c9b2d93..52bd8a0a11e35 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -28,8 +28,9 @@ define void @test_add_double_same_const_args_1(ptr %res, ptr noalias %A, ptr noa
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -76,10 +77,11 @@ define void @test_add_double_same_const_args_2(ptr %res, ptr noalias %A, ptr noa
; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -136,10 +138,11 @@ define void @test_add_double_mixed_const_args(ptr %res, ptr noalias %A, ptr noal
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -175,33 +178,24 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -237,33 +231,24 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -322,10 +307,11 @@ define void @test_add_double_same_var_args_at_different_positions(ptr %res, ptr
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -386,10 +372,11 @@ define void @test_add_double_different_var_args_1(ptr %res, ptr noalias %A, ptr
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -450,10 +437,11 @@ define void @test_add_double_different_var_args_2(ptr %res, ptr noalias %A, ptr
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 6d0c55b1d246c..07af33b6caa95 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -23,8 +23,9 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -48,8 +49,9 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -98,8 +100,9 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_unary_op_wide_load(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -124,8 +127,9 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -174,8 +178,9 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -201,8 +206,9 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -254,8 +260,9 @@ define void @test_2xi64_different_opcodes(ptr noalias %data, ptr noalias %factor
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_different_opcodes(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -281,8 +288,9 @@ define void @test_2xi64_different_opcodes(ptr noalias %data, ptr noalias %factor
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -319,23 +327,21 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP15:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP15]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP16]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[TMP20:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP21:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP20]]
+; VF2-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr [[TMP16]], align 8
; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
-; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP21]], <2 x i64> [[TMP24]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP24]], ptr [[TMP16]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -361,8 +367,9 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -399,23 +406,21 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[TMP21:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP21:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
-; VF2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP22]], <2 x i64> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP22]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_store_order_flipped_1(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -441,8 +446,9 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -479,23 +485,21 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[TMP21:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP21:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
-; VF2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP22]], <2 x i64> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP22]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_store_order_flipped_2(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -521,8 +525,9 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -667,8 +672,9 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor) {
; VF2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_3xi64(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) {
@@ -697,8 +703,9 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor) {
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -954,8 +961,9 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_sub_of_wide_loads(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -979,8 +987,9 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -1031,8 +1040,9 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_sub_of_wide_loads_ops_swapped(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1057,8 +1067,9 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -1111,8 +1122,9 @@ define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(ptr noalias %
; VF2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(
; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
@@ -1139,8 +1151,9 @@ define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(ptr noalias %
; VF4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -1191,8 +1204,9 @@ define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
@@ -1218,8 +1232,9 @@ define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr
; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index bbba4f612dbd3..dd3d36e17ee2c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -21,32 +21,13 @@ define void @pr128062(ptr %dst.start, i8 %a, i16 %b) {
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
-; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[STRIDED_VEC3]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255)
-; CHECK-NEXT: [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16>
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255)
-; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8>
-; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16>
+; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255)
; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8>
; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <4 x i8> [[TMP19]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index 2dceb27165c4d..5c7dcf78f9c4f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -421,12 +421,13 @@ define void @same_op4(ptr noalias noundef %a, ptr noundef %b, ptr noundef %c) {
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC17:%.*]] = load <16 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC22:%.*]] = load <16 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <16 x float> [[WIDE_VEC17]], [[WIDE_VEC]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC22]], [[TMP3]]
-; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP6]]
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 288
; CHECK-NEXT: br i1 [[TMP4]], label %[[FOR_END13:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[FOR_END13]]:
@@ -513,19 +514,20 @@ define void @same_op4_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[C]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <13 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <13 x float> [[TMP4]], <13 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC15:%.*]] = load <16 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC15]], [[TMP4]]
-; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP6]]
+; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP3]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 288
; CHECK-NEXT: br i1 [[TMP5]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[FOR_END11]]:
>From e3d44e4e1afdde69e564044b755859a355b1074c Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Fri, 24 Oct 2025 12:29:30 +0100
Subject: [PATCH 4/6] [LV] Pre-commit test for opcode-mismatch miscompile
---
...28062-interleaved-accesses-narrow-group.ll | 81 +++++++++++++++++++
1 file changed, 81 insertions(+)
diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index dd3d36e17ee2c..e501223c27323 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -80,3 +80,84 @@ loop:
exit:
ret void
}
+
+; Same as above, except one zext is replaced with an sext.
+define void @opcode_mismatch(ptr %dst.start, i8 %a, i16 %b) {
+; CHECK-LABEL: define void @opcode_mismatch(
+; CHECK-SAME: ptr [[DST_START:%.*]], i8 [[A:%.*]], i16 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
+; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
+; CHECK-NEXT: store <4 x i8> [[TMP4]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %dst = phi ptr [ %dst.start, %entry ], [ %dst.next, %loop ]
+ %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4
+ %load.dst = load i8, ptr %dst, align 1
+ %dst.ext = zext i8 %load.dst to i16
+ %mul.dst.0 = mul nuw i16 %dst.ext, %b
+ %udiv.0 = udiv i16 %mul.dst.0, 255
+ %trunc.0 = trunc nuw i16 %udiv.0 to i8
+ %val.0 = add i8 %a, %trunc.0
+ store i8 %val.0, ptr %dst, align 1
+ %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1
+ %load.dst.1 = load i8, ptr %gep.dst.1, align 1
+ %dst.1.ext = sext i8 %load.dst.1 to i16
+ %mul.dst.1 = mul nuw i16 %dst.1.ext, %b
+ %udiv.1 = udiv i16 %mul.dst.1, 255
+ %trunc.1 = trunc nuw i16 %udiv.1 to i8
+ %val.1 = add i8 %a, %trunc.1
+ store i8 %val.1, ptr %gep.dst.1, align 1
+ %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2
+ %load.dst.2 = load i8, ptr %gep.dst.2, align 1
+ %dst.2.ext = zext i8 %load.dst.2 to i16
+ %mul.dst.2 = mul nuw i16 %dst.2.ext, %b
+ %udiv.2 = udiv i16 %mul.dst.2, 255
+ %trunc.2 = trunc nuw i16 %udiv.2 to i8
+ %val.2 = add i8 %a, %trunc.2
+ store i8 %val.2, ptr %gep.dst.2, align 1
+ %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3
+ %load.dst.3 = load i8, ptr %gep.dst.3, align 1
+ %dst.3.ext = zext i8 %load.dst.3 to i16
+ %mul.dst.3 = mul nuw i16 %dst.3.ext, %b
+ %udiv.3 = udiv i16 %mul.dst.3, 255
+ %trunc.3 = trunc nuw i16 %udiv.3 to i8
+ %val.3 = add i8 %a, %trunc.3
+ store i8 %val.3, ptr %gep.dst.3, align 1
+ %iv.next = add i64 %iv, 4
+ %exit.cond = icmp eq i64 %iv.next, 256
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}
>From da6c3d10cb9f68cb1a5dfa7a307457432f4a12e3 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Fri, 24 Oct 2025 14:35:12 +0100
Subject: [PATCH 5/6] [VPlan] Use getOpcodeOrIntrinsicID to fix miscompile
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++++--
...28062-interleaved-accesses-narrow-group.ll | 23 +++++++++++++++++--
2 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a38f4f07408fd..a1bfbb477501a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4104,6 +4104,15 @@ static bool interleaveStoredValuesMatch(ArrayRef<VPValue *> StoredValues) {
if (!IR->getInterleaveGroup()->isFull() ||
!equal(DefI->definedValues(), Def0->definedValues()))
return false;
+ } else if (Def0 != DefI) {
+ auto *SingleDef0 = dyn_cast<VPSingleDefRecipe>(Def0);
+ auto *SingleDefI = dyn_cast<VPSingleDefRecipe>(DefI);
+ if (!SingleDef0 || !SingleDefI)
+ return false;
+ auto Opc0 = getOpcodeOrIntrinsicID(SingleDef0);
+ auto OpcI = getOpcodeOrIntrinsicID(SingleDefI);
+ if (!Opc0 || Opc0 != OpcI)
+ return false;
}
}
}
@@ -4260,11 +4269,12 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
NarrowedOps.insert(RepR);
return RepR;
}
- auto *WideLoad = dyn_cast<VPWidenLoadRecipe>(R);
- if (!WideLoad) {
+ if (isa<VPSingleDefRecipe>(R)) {
+ // Narrow any intervening single-def recipes.
NarrowedOps.insert(V);
return V;
}
+ auto *WideLoad = cast<VPWidenLoadRecipe>(R);
VPValue *PtrOp = WideLoad->getAddr();
if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index e501223c27323..b1bb1de957050 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -107,8 +107,27 @@ define void @opcode_mismatch(ptr %dst.start, i8 %a, i16 %b) {
; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
-; CHECK-NEXT: store <4 x i8> [[TMP4]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i8> [[STRIDED_VEC3]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP23]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255)
+; CHECK-NEXT: [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255)
+; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255)
+; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
>From fb1d76e6f8f970a4f5f7901f15672f1b20422d18 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Fri, 24 Oct 2025 16:30:55 +0100
Subject: [PATCH 6/6] [VPlan] Minor optz
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a1bfbb477501a..3e1640e04e553 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4095,6 +4095,8 @@ static bool interleaveStoredValuesMatch(ArrayRef<VPValue *> StoredValues) {
if (!Def0 || !DefI) {
if (Op0 != OpI)
return false;
+ } else if (Def0 == DefI) {
+ continue;
} else if (Def0->getVPDefID() != DefI->getVPDefID()) {
return false;
} else if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefI)) {
@@ -4104,7 +4106,7 @@ static bool interleaveStoredValuesMatch(ArrayRef<VPValue *> StoredValues) {
if (!IR->getInterleaveGroup()->isFull() ||
!equal(DefI->definedValues(), Def0->definedValues()))
return false;
- } else if (Def0 != DefI) {
+ } else {
auto *SingleDef0 = dyn_cast<VPSingleDefRecipe>(Def0);
auto *SingleDefI = dyn_cast<VPSingleDefRecipe>(DefI);
if (!SingleDef0 || !SingleDefI)
More information about the llvm-commits
mailing list