[llvm] [LV] Pre-commit test for #128062 (PR #164801)

Ramkumar Ramachandra via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 6 02:10:59 PST 2025


https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/164801

>From 045d3436eb7e57c64dcc6cfec56094d40228a1f1 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 23 Oct 2025 11:33:21 +0100
Subject: [PATCH 1/3] [LV] Pre-commit test for #128062

In preparation for extending the work done by dfa665f ([VPlan] Add
transformation to narrow interleave groups) to make the narrowing more
powerful, pre-commit a test case from #128062.
---
 ...28062-interleaved-accesses-narrow-group.ll | 253 ++++++++++++++++++
 1 file changed, 253 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll

diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
new file mode 100644
index 0000000000000..aefcd4f5b777d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck --check-prefix=INTERLEAVED %s
+
+define void @pr128062(ptr %dst.start, i64 %n, i8 %a) {
+; CHECK-LABEL: define void @pr128062(
+; CHECK-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IV_START:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[IV_START]], -4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP7]]
+; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP8]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP9]]
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[NEXT_GEP3]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[NEXT_GEP4]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[NEXT_GEP5]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i8> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i8> [[TMP23]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i8> [[TMP23]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i8> [[TMP23]], i32 3
+; CHECK-NEXT:    store i8 [[TMP24]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT:    store i8 [[TMP25]], ptr [[NEXT_GEP3]], align 1
+; CHECK-NEXT:    store i8 [[TMP26]], ptr [[NEXT_GEP4]], align 1
+; CHECK-NEXT:    store i8 [[TMP27]], ptr [[NEXT_GEP5]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 1
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
+; CHECK-NEXT:    [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
+; CHECK-NEXT:    [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i8> [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i8> [[TMP44]], i32 1
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i8> [[TMP44]], i32 2
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x i8> [[TMP44]], i32 3
+; CHECK-NEXT:    store i8 [[TMP45]], ptr [[TMP28]], align 1
+; CHECK-NEXT:    store i8 [[TMP46]], ptr [[TMP29]], align 1
+; CHECK-NEXT:    store i8 [[TMP47]], ptr [[TMP30]], align 1
+; CHECK-NEXT:    store i8 [[TMP48]], ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 2
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 2
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 2
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 2
+; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[TMP49]], align 1
+; CHECK-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP55:%.*]] = load i8, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i8, ptr [[TMP52]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x i8> poison, i8 [[TMP53]], i32 0
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <4 x i8> [[TMP57]], i8 [[TMP54]], i32 1
+; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <4 x i8> [[TMP58]], i8 [[TMP55]], i32 2
+; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i8> [[TMP59]], i8 [[TMP56]], i32 3
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
+; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
+; CHECK-NEXT:    [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
+; CHECK-NEXT:    [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
+; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i8> [[TMP65]], i32 0
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i8> [[TMP65]], i32 1
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i8> [[TMP65]], i32 2
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i8> [[TMP65]], i32 3
+; CHECK-NEXT:    store i8 [[TMP66]], ptr [[TMP49]], align 1
+; CHECK-NEXT:    store i8 [[TMP67]], ptr [[TMP50]], align 1
+; CHECK-NEXT:    store i8 [[TMP68]], ptr [[TMP51]], align 1
+; CHECK-NEXT:    store i8 [[TMP69]], ptr [[TMP52]], align 1
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 3
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 3
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 3
+; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 3
+; CHECK-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP70]], align 1
+; CHECK-NEXT:    [[TMP75:%.*]] = load i8, ptr [[TMP71]], align 1
+; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr [[TMP72]], align 1
+; CHECK-NEXT:    [[TMP77:%.*]] = load i8, ptr [[TMP73]], align 1
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i8> poison, i8 [[TMP74]], i32 0
+; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i8> [[TMP78]], i8 [[TMP75]], i32 1
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i8> [[TMP79]], i8 [[TMP76]], i32 2
+; CHECK-NEXT:    [[TMP81:%.*]] = insertelement <4 x i8> [[TMP80]], i8 [[TMP77]], i32 3
+; CHECK-NEXT:    [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
+; CHECK-NEXT:    [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
+; CHECK-NEXT:    [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
+; CHECK-NEXT:    [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <4 x i8> [[TMP86]], i32 0
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <4 x i8> [[TMP86]], i32 1
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <4 x i8> [[TMP86]], i32 2
+; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <4 x i8> [[TMP86]], i32 3
+; CHECK-NEXT:    store i8 [[TMP87]], ptr [[TMP70]], align 1
+; CHECK-NEXT:    store i8 [[TMP88]], ptr [[TMP71]], align 1
+; CHECK-NEXT:    store i8 [[TMP89]], ptr [[TMP72]], align 1
+; CHECK-NEXT:    store i8 [[TMP90]], ptr [[TMP73]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+; INTERLEAVED-LABEL: define void @pr128062(
+; INTERLEAVED-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
+; INTERLEAVED-NEXT:  [[ENTRY:.*:]]
+; INTERLEAVED-NEXT:    [[IV_START:%.*]] = and i64 [[N]], -4
+; INTERLEAVED-NEXT:    [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[IV_START]], -4
+; INTERLEAVED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; INTERLEAVED-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVED:       [[VECTOR_PH]]:
+; INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
+; INTERLEAVED-NEXT:    [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
+; INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
+; INTERLEAVED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
+; INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
+; INTERLEAVED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
+; INTERLEAVED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVED:       [[VECTOR_BODY]]:
+; INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; INTERLEAVED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
+; INTERLEAVED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; INTERLEAVED-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; INTERLEAVED-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; INTERLEAVED-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; INTERLEAVED-NEXT:    [[TMP81:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
+; INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT:    [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
+; INTERLEAVED-NEXT:    [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
+; INTERLEAVED-NEXT:    [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
+; INTERLEAVED-NEXT:    [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
+; INTERLEAVED-NEXT:    [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT:    [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
+; INTERLEAVED-NEXT:    [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
+; INTERLEAVED-NEXT:    [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
+; INTERLEAVED-NEXT:    [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
+; INTERLEAVED-NEXT:    [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT:    [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
+; INTERLEAVED-NEXT:    [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
+; INTERLEAVED-NEXT:    [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
+; INTERLEAVED-NEXT:    [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
+; INTERLEAVED-NEXT:    [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
+; INTERLEAVED-NEXT:    [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
+; INTERLEAVED-NEXT:    [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
+; INTERLEAVED-NEXT:    [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
+; INTERLEAVED-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP44]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; INTERLEAVED-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP65]], <4 x i8> [[TMP86]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; INTERLEAVED-NEXT:    [[TMP29:%.*]] = shufflevector <8 x i8> [[TMP27]], <8 x i8> [[TMP28]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; INTERLEAVED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; INTERLEAVED-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
+; INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; INTERLEAVED-NEXT:    [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVED-NEXT:    br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVED:       [[MIDDLE_BLOCK]]:
+; INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; INTERLEAVED-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; INTERLEAVED:       [[SCALAR_PH]]:
+;
+entry:
+  %iv.start = and i64 %n, -4
+  %a.ext = zext i8 %a to i16
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ %iv.start, %entry ]
+  %dst = phi ptr [ %dst.next, %loop ], [ %dst.start, %entry ]
+  %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4
+  %load.dst = load i8, ptr %dst, align 1
+  %dst.ext = zext i8 %load.dst to i16
+  %mul.dst.0 = mul nuw i16 %dst.ext, %a.ext
+  %udiv.0 = udiv i16 %mul.dst.0, 255
+  %trunc.0 = trunc nuw i16 %udiv.0 to i8
+  %val.0 = add i8 %a, %trunc.0
+  store i8 %val.0, ptr %dst, align 1
+  %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1
+  %load.dst.1 = load i8, ptr %gep.dst.1, align 1
+  %dst.1.ext = zext i8 %load.dst.1 to i16
+  %mul.dst.1 = mul nuw i16 %dst.1.ext, %a.ext
+  %udiv.1 = udiv i16 %mul.dst.1, 255
+  %trunc.1 = trunc nuw i16 %udiv.1 to i8
+  %val.1 = add i8 %a, %trunc.1
+  store i8 %val.1, ptr %gep.dst.1, align 1
+  %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2
+  %load.dst.2 = load i8, ptr %gep.dst.2, align 1
+  %dst.2.ext = zext i8 %load.dst.2 to i16
+  %mul.dst.2 = mul nuw i16 %dst.2.ext, %a.ext
+  %udiv.2 = udiv i16 %mul.dst.2, 255
+  %trunc.2 = trunc nuw i16 %udiv.2 to i8
+  %val.2 = add i8 %a, %trunc.2
+  store i8 %val.2, ptr %gep.dst.2, align 1
+  %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3
+  %load.dst.3 = load i8, ptr %gep.dst.3, align 1
+  %dst.3.ext = zext i8 %load.dst.3 to i16
+  %mul.dst.3 = mul nuw i16 %dst.3.ext, %a.ext
+  %udiv.3 = udiv i16 %mul.dst.3, 255
+  %trunc.3 = trunc nuw i16 %udiv.3 to i8
+  %val.3 = add i8 %a, %trunc.3
+  store i8 %val.3, ptr %gep.dst.3, align 1
+  %iv.next = add i64 %iv, -4
+  %exit.cond = icmp eq i64 %iv.next, 0
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:
+  ret void
+}

>From 2a851516c27f0c1ded1ba680a928e00e4dbaccc2 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 23 Oct 2025 20:12:22 +0100
Subject: [PATCH 2/3] [LV] Improve test

---
 ...28062-interleaved-accesses-narrow-group.ll | 252 ++++--------------
 1 file changed, 50 insertions(+), 202 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index aefcd4f5b777d..bbba4f612dbd3 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -1,26 +1,13 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6
-; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
-; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck --check-prefix=INTERLEAVED %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck %s
 
-define void @pr128062(ptr %dst.start, i64 %n, i8 %a) {
+define void @pr128062(ptr %dst.start, i8 %a, i16 %b) {
 ; CHECK-LABEL: define void @pr128062(
-; CHECK-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-SAME: ptr [[DST_START:%.*]], i8 [[A:%.*]], i16 [[B:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[IV_START:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT:    [[A_EXT:%.*]] = zext i8 [[A]] to i16
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[IV_START]], -4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
@@ -28,194 +15,55 @@ define void @pr128062(ptr %dst.start, i64 %n, i8 %a) {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 12
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP7]]
-; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP8]]
-; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP9]]
-; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[NEXT_GEP3]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[NEXT_GEP4]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[NEXT_GEP5]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 2
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
-; CHECK-NEXT:    [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i8> [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i8> [[TMP23]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i8> [[TMP23]], i32 2
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i8> [[TMP23]], i32 3
-; CHECK-NEXT:    store i8 [[TMP24]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT:    store i8 [[TMP25]], ptr [[NEXT_GEP3]], align 1
-; CHECK-NEXT:    store i8 [[TMP26]], ptr [[NEXT_GEP4]], align 1
-; CHECK-NEXT:    store i8 [[TMP27]], ptr [[NEXT_GEP5]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 1
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 1
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 1
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 1
-; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[TMP31]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
-; CHECK-NEXT:    [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i8> [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i8> [[TMP44]], i32 1
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i8> [[TMP44]], i32 2
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x i8> [[TMP44]], i32 3
-; CHECK-NEXT:    store i8 [[TMP45]], ptr [[TMP28]], align 1
-; CHECK-NEXT:    store i8 [[TMP46]], ptr [[TMP29]], align 1
-; CHECK-NEXT:    store i8 [[TMP47]], ptr [[TMP30]], align 1
-; CHECK-NEXT:    store i8 [[TMP48]], ptr [[TMP31]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 2
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 2
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 2
-; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 2
-; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[TMP49]], align 1
-; CHECK-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP50]], align 1
-; CHECK-NEXT:    [[TMP55:%.*]] = load i8, ptr [[TMP51]], align 1
-; CHECK-NEXT:    [[TMP56:%.*]] = load i8, ptr [[TMP52]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x i8> poison, i8 [[TMP53]], i32 0
-; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <4 x i8> [[TMP57]], i8 [[TMP54]], i32 1
-; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <4 x i8> [[TMP58]], i8 [[TMP55]], i32 2
-; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i8> [[TMP59]], i8 [[TMP56]], i32 3
-; CHECK-NEXT:    [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
-; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
-; CHECK-NEXT:    [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
-; CHECK-NEXT:    [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i8> [[TMP65]], i32 0
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i8> [[TMP65]], i32 1
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i8> [[TMP65]], i32 2
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i8> [[TMP65]], i32 3
-; CHECK-NEXT:    store i8 [[TMP66]], ptr [[TMP49]], align 1
-; CHECK-NEXT:    store i8 [[TMP67]], ptr [[TMP50]], align 1
-; CHECK-NEXT:    store i8 [[TMP68]], ptr [[TMP51]], align 1
-; CHECK-NEXT:    store i8 [[TMP69]], ptr [[TMP52]], align 1
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP]], i64 3
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP3]], i64 3
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP4]], i64 3
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds nuw i8, ptr [[NEXT_GEP5]], i64 3
-; CHECK-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP70]], align 1
-; CHECK-NEXT:    [[TMP75:%.*]] = load i8, ptr [[TMP71]], align 1
-; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr [[TMP72]], align 1
-; CHECK-NEXT:    [[TMP77:%.*]] = load i8, ptr [[TMP73]], align 1
-; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i8> poison, i8 [[TMP74]], i32 0
-; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i8> [[TMP78]], i8 [[TMP75]], i32 1
-; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i8> [[TMP79]], i8 [[TMP76]], i32 2
-; CHECK-NEXT:    [[TMP81:%.*]] = insertelement <4 x i8> [[TMP80]], i8 [[TMP77]], i32 3
-; CHECK-NEXT:    [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
-; CHECK-NEXT:    [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
-; CHECK-NEXT:    [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
-; CHECK-NEXT:    [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
-; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <4 x i8> [[TMP86]], i32 0
-; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <4 x i8> [[TMP86]], i32 1
-; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <4 x i8> [[TMP86]], i32 2
-; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <4 x i8> [[TMP86]], i32 3
-; CHECK-NEXT:    store i8 [[TMP87]], ptr [[TMP70]], align 1
-; CHECK-NEXT:    store i8 [[TMP88]], ptr [[TMP71]], align 1
-; CHECK-NEXT:    store i8 [[TMP89]], ptr [[TMP72]], align 1
-; CHECK-NEXT:    store i8 [[TMP90]], ptr [[TMP73]], align 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[STRIDED_VEC3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255)
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255)
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255)
+; CHECK-NEXT:    [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-;
-; INTERLEAVED-LABEL: define void @pr128062(
-; INTERLEAVED-SAME: ptr [[DST_START:%.*]], i64 [[N:%.*]], i8 [[A:%.*]]) {
-; INTERLEAVED-NEXT:  [[ENTRY:.*:]]
-; INTERLEAVED-NEXT:    [[IV_START:%.*]] = and i64 [[N]], -4
-; INTERLEAVED-NEXT:    [[A_EXT:%.*]] = zext i8 [[A]] to i16
-; INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[IV_START]], -4
-; INTERLEAVED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; INTERLEAVED-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; INTERLEAVED:       [[VECTOR_PH]]:
-; INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], -4
-; INTERLEAVED-NEXT:    [[TMP4:%.*]] = add i64 [[IV_START]], [[TMP3]]
-; INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[N_VEC]], 4
-; INTERLEAVED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[TMP5]]
-; INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A_EXT]], i64 0
-; INTERLEAVED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
-; INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
-; INTERLEAVED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
-; INTERLEAVED:       [[VECTOR_BODY]]:
-; INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; INTERLEAVED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
-; INTERLEAVED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; INTERLEAVED-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; INTERLEAVED-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; INTERLEAVED-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; INTERLEAVED-NEXT:    [[TMP81:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i16>
-; INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul nuw <4 x i16> [[TMP19]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT:    [[TMP21:%.*]] = udiv <4 x i16> [[TMP20]], splat (i16 255)
-; INTERLEAVED-NEXT:    [[TMP22:%.*]] = trunc nuw <4 x i16> [[TMP21]] to <4 x i8>
-; INTERLEAVED-NEXT:    [[TMP23:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP22]]
-; INTERLEAVED-NEXT:    [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i16>
-; INTERLEAVED-NEXT:    [[TMP41:%.*]] = mul nuw <4 x i16> [[TMP40]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT:    [[TMP42:%.*]] = udiv <4 x i16> [[TMP41]], splat (i16 255)
-; INTERLEAVED-NEXT:    [[TMP43:%.*]] = trunc nuw <4 x i16> [[TMP42]] to <4 x i8>
-; INTERLEAVED-NEXT:    [[TMP44:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP43]]
-; INTERLEAVED-NEXT:    [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i16>
-; INTERLEAVED-NEXT:    [[TMP62:%.*]] = mul nuw <4 x i16> [[TMP61]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT:    [[TMP63:%.*]] = udiv <4 x i16> [[TMP62]], splat (i16 255)
-; INTERLEAVED-NEXT:    [[TMP64:%.*]] = trunc nuw <4 x i16> [[TMP63]] to <4 x i8>
-; INTERLEAVED-NEXT:    [[TMP65:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP64]]
-; INTERLEAVED-NEXT:    [[TMP82:%.*]] = zext <4 x i8> [[TMP81]] to <4 x i16>
-; INTERLEAVED-NEXT:    [[TMP83:%.*]] = mul nuw <4 x i16> [[TMP82]], [[BROADCAST_SPLAT]]
-; INTERLEAVED-NEXT:    [[TMP84:%.*]] = udiv <4 x i16> [[TMP83]], splat (i16 255)
-; INTERLEAVED-NEXT:    [[TMP85:%.*]] = trunc nuw <4 x i16> [[TMP84]] to <4 x i8>
-; INTERLEAVED-NEXT:    [[TMP86:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP85]]
-; INTERLEAVED-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP44]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; INTERLEAVED-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP65]], <4 x i8> [[TMP86]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; INTERLEAVED-NEXT:    [[TMP29:%.*]] = shufflevector <8 x i8> [[TMP27]], <8 x i8> [[TMP28]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; INTERLEAVED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; INTERLEAVED-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
-; INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; INTERLEAVED-NEXT:    [[TMP91:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVED-NEXT:    br i1 [[TMP91]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; INTERLEAVED:       [[MIDDLE_BLOCK]]:
-; INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; INTERLEAVED-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; INTERLEAVED:       [[SCALAR_PH]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
-  %iv.start = and i64 %n, -4
-  %a.ext = zext i8 %a to i16
   br label %loop
 
 loop:
-  %iv = phi i64 [ %iv.next, %loop ], [ %iv.start, %entry ]
-  %dst = phi ptr [ %dst.next, %loop ], [ %dst.start, %entry ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %dst = phi ptr [ %dst.start, %entry ], [ %dst.next, %loop ]
   %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4
   %load.dst = load i8, ptr %dst, align 1
   %dst.ext = zext i8 %load.dst to i16
-  %mul.dst.0 = mul nuw i16 %dst.ext, %a.ext
+  %mul.dst.0 = mul nuw i16 %dst.ext, %b
   %udiv.0 = udiv i16 %mul.dst.0, 255
   %trunc.0 = trunc nuw i16 %udiv.0 to i8
   %val.0 = add i8 %a, %trunc.0
@@ -223,7 +71,7 @@ loop:
   %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1
   %load.dst.1 = load i8, ptr %gep.dst.1, align 1
   %dst.1.ext = zext i8 %load.dst.1 to i16
-  %mul.dst.1 = mul nuw i16 %dst.1.ext, %a.ext
+  %mul.dst.1 = mul nuw i16 %dst.1.ext, %b
   %udiv.1 = udiv i16 %mul.dst.1, 255
   %trunc.1 = trunc nuw i16 %udiv.1 to i8
   %val.1 = add i8 %a, %trunc.1
@@ -231,7 +79,7 @@ loop:
   %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2
   %load.dst.2 = load i8, ptr %gep.dst.2, align 1
   %dst.2.ext = zext i8 %load.dst.2 to i16
-  %mul.dst.2 = mul nuw i16 %dst.2.ext, %a.ext
+  %mul.dst.2 = mul nuw i16 %dst.2.ext, %b
   %udiv.2 = udiv i16 %mul.dst.2, 255
   %trunc.2 = trunc nuw i16 %udiv.2 to i8
   %val.2 = add i8 %a, %trunc.2
@@ -239,13 +87,13 @@ loop:
   %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3
   %load.dst.3 = load i8, ptr %gep.dst.3, align 1
   %dst.3.ext = zext i8 %load.dst.3 to i16
-  %mul.dst.3 = mul nuw i16 %dst.3.ext, %a.ext
+  %mul.dst.3 = mul nuw i16 %dst.3.ext, %b
   %udiv.3 = udiv i16 %mul.dst.3, 255
   %trunc.3 = trunc nuw i16 %udiv.3 to i8
   %val.3 = add i8 %a, %trunc.3
   store i8 %val.3, ptr %gep.dst.3, align 1
-  %iv.next = add i64 %iv, -4
-  %exit.cond = icmp eq i64 %iv.next, 0
+  %iv.next = add i64 %iv, 4
+  %exit.cond = icmp eq i64 %iv.next, 256
   br i1 %exit.cond, label %exit, label %loop
 
 exit:

>From 41ba46ff218b06be331a9bf6f1d1f1690bc0d8bf Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Fri, 24 Oct 2025 12:29:30 +0100
Subject: [PATCH 3/3] [LV] Pre-commit test for opcode-mismatch miscompile

---
 ...28062-interleaved-accesses-narrow-group.ll | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index bbba4f612dbd3..00eeb69dcb0f7 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -99,3 +99,103 @@ loop:
 exit:
   ret void
 }
+
+; Same as above, except that the zext of the second load (%load.dst.1) is replaced with an sext.
+define void @opcode_mismatch(ptr %dst.start, i8 %a, i16 %b) {
+; CHECK-LABEL: define void @opcode_mismatch(
+; CHECK-SAME: ptr [[DST_START:%.*]], i8 [[A:%.*]], i16 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i8> [[STRIDED_VEC3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255)
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255)
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255)
+; CHECK-NEXT:    [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %dst = phi ptr [ %dst.start, %entry ], [ %dst.next, %loop ]
+  %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4
+  %load.dst = load i8, ptr %dst, align 1
+  %dst.ext = zext i8 %load.dst to i16
+  %mul.dst.0 = mul nuw i16 %dst.ext, %b
+  %udiv.0 = udiv i16 %mul.dst.0, 255
+  %trunc.0 = trunc nuw i16 %udiv.0 to i8
+  %val.0 = add i8 %a, %trunc.0
+  store i8 %val.0, ptr %dst, align 1
+  %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1
+  %load.dst.1 = load i8, ptr %gep.dst.1, align 1
+  %dst.1.ext = sext i8 %load.dst.1 to i16
+  %mul.dst.1 = mul nuw i16 %dst.1.ext, %b
+  %udiv.1 = udiv i16 %mul.dst.1, 255
+  %trunc.1 = trunc nuw i16 %udiv.1 to i8
+  %val.1 = add i8 %a, %trunc.1
+  store i8 %val.1, ptr %gep.dst.1, align 1
+  %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2
+  %load.dst.2 = load i8, ptr %gep.dst.2, align 1
+  %dst.2.ext = zext i8 %load.dst.2 to i16
+  %mul.dst.2 = mul nuw i16 %dst.2.ext, %b
+  %udiv.2 = udiv i16 %mul.dst.2, 255
+  %trunc.2 = trunc nuw i16 %udiv.2 to i8
+  %val.2 = add i8 %a, %trunc.2
+  store i8 %val.2, ptr %gep.dst.2, align 1
+  %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3
+  %load.dst.3 = load i8, ptr %gep.dst.3, align 1
+  %dst.3.ext = zext i8 %load.dst.3 to i16
+  %mul.dst.3 = mul nuw i16 %dst.3.ext, %b
+  %udiv.3 = udiv i16 %mul.dst.3, 255
+  %trunc.3 = trunc nuw i16 %udiv.3 to i8
+  %val.3 = add i8 %a, %trunc.3
+  store i8 %val.3, ptr %gep.dst.3, align 1
+  %iv.next = add i64 %iv, 4
+  %exit.cond = icmp eq i64 %iv.next, 256
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:
+  ret void
+}



More information about the llvm-commits mailing list