[llvm] [SLP] Reject duplicate shift amounts in matchesShlZExt reorder path (PR #183627)

Fri Feb 27 08:14:06 PST 2026

https://github.com/akadutta updated https://github.com/llvm/llvm-project/pull/183627

>From e320929357c2cd68ad2a490632f25083b13ffbea Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 26 Feb 2026 16:05:09 -0600
Subject: [PATCH 1/2] duplicate shift check zext bitcast reorder

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  5 ++
 .../AMDGPU/zext-duplicate-shift.ll            | 59 +++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/zext-duplicate-shift.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3a10d2b2a7158..7ec3a341d01e0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13420,6 +13420,8 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
   } else {
     const unsigned VF = RhsTE->getVectorFactor();
     Order.assign(VF, VF);
+    // Track which logical positions we've seen; reject duplicate shift amounts.
+    SmallBitVector SeenPositions(VF);
     // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
     // ..., Sz-Stride).
     if (VF * Stride != Sz)
@@ -13434,6 +13436,9 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
       if (Val.isNegative() || Val.uge(Sz) || Val.getZExtValue() % Stride != 0)
         return false;
       unsigned Pos = Val.getZExtValue() / Stride;
+      if (SeenPositions.test(Pos))
+        return false;
+      SeenPositions.set(Pos);
       // TODO: Support Pos >= VF, in this case need to shift the final value.
       if (Order[Idx] != VF || Pos >= VF)
         return false;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/zext-duplicate-shift.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/zext-duplicate-shift.ll
new file mode 100644
index 0000000000000..a87e5441a6431
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/zext-duplicate-shift.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -S -o - < %s | FileCheck --implicit-check-not='<i32 2, i32 3, i32 poison, i32 poison>' %s
+
+define void @duplicate_shift_i128_store(ptr %base, i32 %spec_select, i32 %spec_select37, i1 %narrow) {
+; CHECK-LABEL: define void @duplicate_shift_i128_store(
+; CHECK-SAME: ptr [[BASE:%.*]], i32 [[SPEC_SELECT:%.*]], i32 [[SPEC_SELECT37:%.*]], i1 [[NARROW:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 1024
+; CHECK-NEXT:    [[P_INSERT_11:%.*]] = select i1 [[NARROW]], i128 158456325046975419260797452288, i128 158456325028528675187087900672
+; CHECK-NEXT:    [[ZEXT37:%.*]] = zext i32 [[SPEC_SELECT37]] to i128
+; CHECK-NEXT:    [[SHL37:%.*]] = shl nuw nsw i128 [[ZEXT37]], 32
+; CHECK-NEXT:    [[MASKED37:%.*]] = and i128 [[SHL37]], 18446742974197923840
+; CHECK-NEXT:    [[AND37_255:%.*]] = and i32 [[SPEC_SELECT37]], 255
+; CHECK-NEXT:    [[ZEXT37_255:%.*]] = zext nneg i32 [[AND37_255]] to i128
+; CHECK-NEXT:    [[SHL37_255:%.*]] = shl nuw nsw i128 [[ZEXT37_255]], 32
+; CHECK-NEXT:    [[P_MASK_7:%.*]] = or disjoint i128 [[P_INSERT_11]], [[MASKED37]]
+; CHECK-NEXT:    [[P_INSERT_7:%.*]] = or disjoint i128 [[P_MASK_7]], [[SHL37_255]]
+; CHECK-NEXT:    [[PREFIX_HIGH:%.*]] = and i32 [[SPEC_SELECT]], -16777216
+; CHECK-NEXT:    [[ZEXT_HIGH:%.*]] = zext i32 [[PREFIX_HIGH]] to i128
+; CHECK-NEXT:    [[P_INSERT_6:%.*]] = or disjoint i128 [[P_INSERT_7]], [[ZEXT_HIGH]]
+; CHECK-NEXT:    [[AND_16711680:%.*]] = and i32 [[SPEC_SELECT]], 16711680
+; CHECK-NEXT:    [[ZEXT_16711680:%.*]] = zext nneg i32 [[AND_16711680]] to i128
+; CHECK-NEXT:    [[P_MASK_4:%.*]] = or disjoint i128 [[P_INSERT_6]], [[ZEXT_16711680]]
+; CHECK-NEXT:    [[AND_255:%.*]] = and i32 [[SPEC_SELECT]], 255
+; CHECK-NEXT:    [[ZEXT_255:%.*]] = zext nneg i32 [[AND_255]] to i128
+; CHECK-NEXT:    [[AND_65280:%.*]] = and i32 [[SPEC_SELECT]], 65280
+; CHECK-NEXT:    [[ZEXT_65280:%.*]] = zext nneg i32 [[AND_65280]] to i128
+; CHECK-NEXT:    [[P_MASK_0:%.*]] = or disjoint i128 [[P_MASK_4]], [[ZEXT_65280]]
+; CHECK-NEXT:    [[P_INSERT_0:%.*]] = or disjoint i128 [[P_MASK_0]], [[ZEXT_255]]
+; CHECK-NEXT:    store i128 [[P_INSERT_0]], ptr [[ARRAYIDX]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx = getelementptr inbounds i8, ptr %base, i64 1024
+  %p_insert_11 = select i1 %narrow, i128 158456325046975419260797452288, i128 158456325028528675187087900672
+  %zext37 = zext i32 %spec_select37 to i128
+  %shl37 = shl nuw nsw i128 %zext37, 32
+  %masked37 = and i128 %shl37, 18446742974197923840
+  %and37_255 = and i32 %spec_select37, 255
+  %zext37_255 = zext nneg i32 %and37_255 to i128
+  %shl37_255 = shl nuw nsw i128 %zext37_255, 32
+  %p_mask_7 = or disjoint i128 %p_insert_11, %masked37
+  %p_insert_7 = or disjoint i128 %p_mask_7, %shl37_255
+  %prefix_high = and i32 %spec_select, -16777216
+  %zext_high = zext i32 %prefix_high to i128
+  %p_insert_6 = or disjoint i128 %p_insert_7, %zext_high
+  %and_16711680 = and i32 %spec_select, 16711680
+  %zext_16711680 = zext nneg i32 %and_16711680 to i128
+  %p_mask_4 = or disjoint i128 %p_insert_6, %zext_16711680
+  %and_255 = and i32 %spec_select, 255
+  %zext_255 = zext nneg i32 %and_255 to i128
+  %and_65280 = and i32 %spec_select, 65280
+  %zext_65280 = zext nneg i32 %and_65280 to i128
+  %p_mask_0 = or disjoint i128 %p_mask_4, %zext_65280
+  %p_insert_0 = or disjoint i128 %p_mask_0, %zext_255
+  ; Generic store i128 (triggers SLP); %shl37 and %shl37_255 both shift by 32, i.e. a duplicate shift amount.
+  store i128 %p_insert_0, ptr %arrayidx, align 16
+  ret void
+}

>From 6a07f8806570de92f2e5b8b7b395ce35a608501f Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 26 Feb 2026 20:46:02 -0600
Subject: [PATCH 2/2] update seen positions tracker to SmallVector

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7ec3a341d01e0..883b00083f85c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13421,7 +13421,8 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
     const unsigned VF = RhsTE->getVectorFactor();
     Order.assign(VF, VF);
     // Track which logical positions we've seen; reject duplicate shift amounts.
-    SmallBitVector SeenPositions(VF);
+    SmallVector<unsigned, 4> SeenPositions;
+    SeenPositions.assign(VF, VF);
     // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
     // ..., Sz-Stride).
     if (VF * Stride != Sz)
@@ -13436,9 +13437,9 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
       if (Val.isNegative() || Val.uge(Sz) || Val.getZExtValue() % Stride != 0)
         return false;
       unsigned Pos = Val.getZExtValue() / Stride;
-      if (SeenPositions.test(Pos))
+      if (SeenPositions[Pos] != VF)
         return false;
-      SeenPositions.set(Pos);
+      SeenPositions[Pos] = Pos;
       // TODO: Support Pos >= VF, in this case need to shift the final value.
       if (Order[Idx] != VF || Pos >= VF)
         return false;