[llvm] f564a48 - [SLP]Fix PR108700: correctly identify id of the operand node
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 16 09:44:56 PDT 2024
Author: Alexey Bataev
Date: 2024-09-16T09:44:47-07:00
New Revision: f564a48f0ea4d2100c0cadfa6e6f20f97244025e
URL: https://github.com/llvm/llvm-project/commit/f564a48f0ea4d2100c0cadfa6e6f20f97244025e
DIFF: https://github.com/llvm/llvm-project/commit/f564a48f0ea4d2100c0cadfa6e6f20f97244025e.diff
LOG: [SLP]Fix PR108700: correctly identify id of the operand node
If the operand node for truncs is not created during tree construction but
one of the previously built nodes is reused instead, its index must be
identified correctly so that code emission uses the right node.
Fixes https://github.com/llvm/llvm-project/issues/108700
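For readers following along, here is a minimal stand-alone sketch (not LLVM
code; TreeEntry, BuildOperand and the reuse map below are simplified
stand-ins for the SLP vectorizer's internals) of the indexing assumption
that broke: the old code recorded VectorizableTree.size() + 1 as the trunc
operand's node index before that operand was actually built, which only
holds when a fresh node is appended; when an existing node is reused, the
recorded index points at the wrong entry, so the patch builds the operands
first and then records the index the operand entry actually received.

    // Illustrative model only, assuming a simple dedup map stands in for
    // node reuse in the vectorizable tree.
    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    struct TreeEntry {
      unsigned Idx;    // position in the vectorizable tree
      std::string Key; // stands in for the bundled scalars
    };

    int main() {
      std::vector<TreeEntry> Tree;          // models VectorizableTree
      std::map<std::string, unsigned> Seen; // bundle -> existing node index
      std::set<unsigned> ExtraBitWidthNodes;

      auto BuildOperand = [&](const std::string &Key) -> unsigned {
        // If an equivalent node already exists, it is reused instead of
        // creating a new one at the end of the tree.
        auto It = Seen.find(Key);
        if (It != Seen.end())
          return It->second;
        unsigned Idx = Tree.size();
        Tree.push_back({Idx, Key});
        Seen.emplace(Key, Idx);
        return Idx;
      };

      BuildOperand("load-bundle"); // node 0, reused later

      // Build a trunc node whose single operand is the already-seen bundle.
      unsigned TruncIdx = Tree.size();
      Tree.push_back({TruncIdx, "trunc-bundle"});

      // Old logic: assume the operand node will be appended right after the
      // trunc node (the "+ 1" in the original was computed before the trunc
      // entry itself was appended).
      unsigned GuessedOperandIdx = Tree.size();

      // New logic: build the operand first, then use the index it actually
      // received, which handles the reused-node case.
      unsigned ActualOperandIdx = BuildOperand("load-bundle"); // reused -> 0

      std::printf("guessed %u, actual %u\n", GuessedOperandIdx,
                  ActualOperandIdx);
      ExtraBitWidthNodes.insert(ActualOperandIdx);
      return 0;
    }

Running the sketch prints different guessed and actual indices, which is the
mismatch the patch eliminates by querying getOperandEntry(TE, 0)->Idx after
the operands have been built.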
Added:
llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5f2bf082fb87f0..282bb8eac7e2e4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7481,7 +7481,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
PrevMaxBW),
std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
PrevMinBW));
- ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ TE->setOperandsInOrder();
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
+ if (ShuffleOrOp == Instruction::Trunc) {
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
} else if (ShuffleOrOp == Instruction::SIToFP ||
ShuffleOrOp == Instruction::UIToFP) {
unsigned NumSignBits =
@@ -7492,15 +7501,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
if (NumSignBits * 2 >=
DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
- ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
}
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
-
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
- buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
case Instruction::ICmp:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
index 50b19d01ad58f1..6922df8991b831 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
@@ -6,10 +6,10 @@ define void @test(ptr %block, ptr noalias %pixels, i1 %b) {
; CHECK-SAME: ptr [[BLOCK:%.*]], ptr noalias [[PIXELS:%.*]], i1 [[B:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> <i1 true, i1 poison, i1 false, i1 false>, i1 [[B]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[BLOCK]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i16> [[TMP2]] to <4 x i8>
-; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i8> [[TMP4]], <4 x i8> [[TMP1]]
; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr [[PIXELS]], align 1
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll b/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll
new file mode 100644
index 00000000000000..4b62ef688ca44f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i16 @test() {
+; CHECK-LABEL: define i16 @test() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> <i1 false, i1 false, i1 poison, i1 poison>, <2 x i1> zeroinitializer, i64 2)
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> zeroinitializer, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> zeroinitializer, <4 x i1> [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i1> [[TMP7]] to <4 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]])
+; CHECK-NEXT: ret i16 [[TMP9]]
+;
+entry:
+ %conv73 = xor i64 0, 0
+ %and.i = and i64 0, 0
+ %xor2.i = or i64 %and.i, 0
+ %sub.i = or i64 %xor2.i, 0
+ %xor3.i = xor i64 %sub.i, %conv73
+ %and4.i = and i64 %xor3.i, 0
+ %cmp.i = icmp slt i64 %and4.i, 0
+ %0 = trunc i64 %conv73 to i16
+ %1 = or i16 0, %0
+ %conv73i = xor i64 0, 0
+ %andi.i = and i64 0, 0
+ %xor2i.i = or i64 %andi.i, 0
+ %subi.i = or i64 %xor2i.i, 0
+ %xor3i.i = xor i64 %subi.i, %conv73i
+ %and4i.i = and i64 %xor3i.i, 0
+ %cmpi.i = icmp slt i64 %and4i.i, 0
+ %2 = trunc i64 %conv73i to i16
+ %3 = or i16 0, %2
+ %4 = select i1 %cmpi.i, i16 0, i16 %3
+ %5 = select i1 %cmp.i, i16 0, i16 %1
+ %6 = zext i32 0 to i64
+ %add.ip = or i64 %6, 0
+ %orp = or i64 %add.ip, 0
+ %conv72p = shl i64 %orp, 0
+ %sextp = ashr i64 %conv72p, 0
+ %conv73p = xor i64 %sextp, 0
+ %and.ip = and i64 0, 0
+ %xor2.ip = or i64 %and.ip, 0
+ %sub.ip = or i64 %xor2.ip, 0
+ %xor3.ip = xor i64 %sub.ip, %conv73p
+ %and4.ip = and i64 %xor3.ip, 0
+ %cmp.ip = icmp slt i64 %and4.ip, 0
+ %7 = trunc i64 %conv73p to i16
+ %8 = or i16 0, %7
+ %9 = select i1 %cmp.ip, i16 0, i16 %8
+ %conv76i = and i16 %4, %5
+ %conv76p = and i16 %conv76i, %9
+ %10 = zext i32 0 to i64
+ %add.ip1 = or i64 %10, 0
+ %orp1 = or i64 %add.ip1, 0
+ %conv72p1 = shl i64 %orp1, 0
+ %sextp1 = ashr i64 %conv72p1, 0
+ %conv73p1 = xor i64 %sextp1, 0
+ %and.ip1 = and i64 0, 0
+ %xor2.ip1 = or i64 %and.ip1, 0
+ %sub.ip1 = or i64 %xor2.ip1, 0
+ %xor3.ip1 = xor i64 %sub.ip1, %conv73p1
+ %and4.ip1 = and i64 %xor3.ip1, 0
+ %cmp.ip1 = icmp slt i64 %and4.ip1, 0
+ %11 = trunc i64 %conv73p1 to i16
+ %12 = or i16 0, %11
+ %13 = select i1 %cmp.ip1, i16 0, i16 %12
+ %conv76p2 = and i16 %conv76p, %13
+ ret i16 %conv76p2
+}