[llvm] 8c41859 - [SLP]Clear the operands deps of non-schedulable nodes, if previously all operands were copyable

Thu Sep 18 12:11:42 PDT 2025

Author: Alexey Bataev
Date: 2025-09-18T12:11:33-07:00
New Revision: 8c41859a21a4d0cfda164cc58f4a5336dbcd30d1

URL: https://github.com/llvm/llvm-project/commit/8c41859a21a4d0cfda164cc58f4a5336dbcd30d1
DIFF: https://github.com/llvm/llvm-project/commit/8c41859a21a4d0cfda164cc58f4a5336dbcd30d1.diff

LOG: [SLP]Clear the operands deps of non-schedulable nodes, if previously all operands were copyable

If all operands of a non-schedulable node were previously only
copyables, the dependencies of the original schedule data for such
copyable operands must be cleared and recalculated so that the number
of dependencies is handled correctly.

Fixes #159406

Added: 
    llvm/test/Transforms/SLPVectorizer/X86/non-sched-inst-has-copyable-before.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0c94a1d593ce0..6ac9018df641e 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20804,12 +20804,45 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const EdgeInfo &EI) {
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
-  bool HasCopyables = S.areInstructionsWithCopyableElements();
   if (isa<PHINode>(S.getMainOp()) ||
-      isVectorLikeInstWithConstOps(S.getMainOp()) ||
-      (!HasCopyables && doesNotNeedToSchedule(VL)) ||
-      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
+      isVectorLikeInstWithConstOps(S.getMainOp()))
+    return nullptr;
+  bool HasCopyables = S.areInstructionsWithCopyableElements();
+  if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
+       all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
+    // If all operands were replaced by copyables, the operands of this node
+    // might be not, so need to recalculate dependencies for schedule data,
+    // replaced by copyable schedule data.
+    SmallVector<ScheduleData *> ControlDependentMembers;
+    for (Value *V : VL) {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I || (HasCopyables && S.isCopyableElement(V)))
+        continue;
+      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
+      for (const Use &U : I->operands()) {
+        unsigned &NumOps =
+            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
+                .first->getSecond();
+        ++NumOps;
+        if (auto *Op = dyn_cast<Instruction>(U.get());
+            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
+          if (ScheduleData *OpSD = getScheduleData(Op);
+              OpSD && OpSD->hasValidDependencies()) {
+            OpSD->clearDirectDependencies();
+            if (RegionHasStackSave ||
+                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
+              ControlDependentMembers.push_back(OpSD);
+          }
+        }
+      }
+    }
+    if (!ControlDependentMembers.empty()) {
+      ScheduleBundle Invalid = ScheduleBundle::invalid();
+      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
+                            ControlDependentMembers);
+    }
     return nullptr;
+  }
 
   // Initialize the instruction bundle.
   Instruction *OldScheduleEnd = ScheduleEnd;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/non-sched-inst-has-copyable-before.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-sched-inst-has-copyable-before.ll
new file mode 100644
index 0000000000000..fe389ee78071a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-sched-inst-has-copyable-before.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-cros-linux-gnu < %s | FileCheck %s
+
+%struct.fe = type { [5 x i64] }
+
+define i32 @test(i64 %0, i128 %1, i1 %2) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i64 [[TMP0:%.*]], i128 [[TMP1:%.*]], i1 [[TMP2:%.*]]) {
+; CHECK-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_FE:%.*]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 24
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 8
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i64 [ undef, [[TMP3:%.*]] ], [ [[TMP29:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP26:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP23:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP20:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP17:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[DOTSROA_14_0:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP52:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[DOTSROA_11_0:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP50:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[DOTSROA_8_0:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP57:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[DOTSROA_4_0:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP56:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[DOTSROA_0_0:%.*]] = phi i64 [ undef, [[TMP3]] ], [ [[TMP54:%.*]], %[[BB9]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor i64 [[DOTSROA_0_0]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and i64 [[TMP15]], [[TMP0]]
+; CHECK-NEXT:    [[TMP17]] = xor i64 [[TMP16]], 1
+; CHECK-NEXT:    store i64 [[TMP17]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[DOTSROA_4_0]], [[TMP13]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and i64 [[TMP18]], [[TMP0]]
+; CHECK-NEXT:    [[TMP20]] = xor i64 [[TMP19]], 1
+; CHECK-NEXT:    store i64 [[TMP20]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[DOTSROA_8_0]], [[TMP12]]
+; CHECK-NEXT:    [[TMP22:%.*]] = and i64 [[TMP21]], [[TMP0]]
+; CHECK-NEXT:    [[TMP23]] = xor i64 [[TMP22]], 1
+; CHECK-NEXT:    store i64 [[TMP23]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[DOTSROA_11_0]], [[TMP11]]
+; CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], [[TMP0]]
+; CHECK-NEXT:    [[TMP26]] = xor i64 [[TMP25]], 1
+; CHECK-NEXT:    store i64 [[TMP26]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[DOTSROA_14_0]], [[TMP10]]
+; CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP0]]
+; CHECK-NEXT:    [[TMP29]] = xor i64 [[TMP28]], 1
+; CHECK-NEXT:    store i64 [[TMP29]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP30:%.*]] = load i64, ptr null, align 4294967296
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP19]], 1
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[TMP17]], 1
+; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP29]], 19
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i64 [[TMP34]] to i128
+; CHECK-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP26]], 19
+; CHECK-NEXT:    [[TMP37:%.*]] = zext i64 [[TMP36]] to i128
+; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP23]], 19
+; CHECK-NEXT:    [[TMP39:%.*]] = zext i64 [[TMP38]] to i128
+; CHECK-NEXT:    [[TMP40:%.*]] = mul nuw nsw i128 [[TMP39]], 24
+; CHECK-NEXT:    [[TMP41:%.*]] = zext i64 [[TMP32]] to i128
+; CHECK-NEXT:    [[TMP42:%.*]] = mul nuw i128 [[TMP37]], [[TMP41]]
+; CHECK-NEXT:    [[TMP43:%.*]] = zext i64 [[TMP31]] to i128
+; CHECK-NEXT:    [[TMP44:%.*]] = mul nuw i128 [[TMP35]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = zext i64 [[TMP33]] to i128
+; CHECK-NEXT:    [[TMP46:%.*]] = mul i128 [[TMP1]], [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = add i128 [[TMP40]], [[TMP46]]
+; CHECK-NEXT:    [[TMP48:%.*]] = add i128 [[TMP47]], [[TMP42]]
+; CHECK-NEXT:    [[TMP49:%.*]] = add i128 [[TMP48]], [[TMP44]]
+; CHECK-NEXT:    [[TMP50]] = and i64 [[TMP29]], 1
+; CHECK-NEXT:    [[TMP51:%.*]] = trunc i128 [[TMP49]] to i64
+; CHECK-NEXT:    [[TMP52]] = and i64 [[TMP30]], 1
+; CHECK-NEXT:    [[TMP53:%.*]] = add i64 [[TMP51]], 1
+; CHECK-NEXT:    [[TMP54]] = and i64 [[TMP53]], 1
+; CHECK-NEXT:    [[TMP55:%.*]] = lshr i64 [[TMP53]], 1
+; CHECK-NEXT:    [[TMP56]] = and i64 [[TMP19]], 1
+; CHECK-NEXT:    [[TMP57]] = or i64 [[TMP55]], 1
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB58:.*]], label %[[BB9]]
+; CHECK:       [[BB58]]:
+; CHECK-NEXT:    call void @g(ptr nonnull [[TMP4]])
+; CHECK-NEXT:    ret i32 0
+;
+  %4 = alloca %struct.fe, align 8
+  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
+  %6 = getelementptr inbounds nuw i8, ptr %4, i64 24
+  %7 = getelementptr inbounds nuw i8, ptr %4, i64 32
+  %8 = getelementptr inbounds nuw i8, ptr %4, i64 8
+  br label %9
+
+9:
+  %10 = phi i64 [ undef, %3 ], [ %29, %9 ]
+  %11 = phi i64 [ undef, %3 ], [ %26, %9 ]
+  %12 = phi i64 [ undef, %3 ], [ %23, %9 ]
+  %13 = phi i64 [ undef, %3 ], [ %20, %9 ]
+  %14 = phi i64 [ undef, %3 ], [ %17, %9 ]
+  %.sroa.14.0 = phi i64 [ undef, %3 ], [ %52, %9 ]
+  %.sroa.11.0 = phi i64 [ undef, %3 ], [ %50, %9 ]
+  %.sroa.8.0 = phi i64 [ undef, %3 ], [ %57, %9 ]
+  %.sroa.4.0 = phi i64 [ undef, %3 ], [ %56, %9 ]
+  %.sroa.0.0 = phi i64 [ undef, %3 ], [ %54, %9 ]
+  %15 = xor i64 %.sroa.0.0, %14
+  %16 = and i64 %15, %0
+  %17 = xor i64 %16, 1
+  store i64 %17, ptr %4, align 8
+  %18 = xor i64 %.sroa.4.0, %13
+  %19 = and i64 %18, %0
+  %20 = xor i64 %19, 1
+  store i64 %20, ptr %8, align 8
+  %21 = xor i64 %.sroa.8.0, %12
+  %22 = and i64 %21, %0
+  %23 = xor i64 %22, 1
+  store i64 %23, ptr %5, align 8
+  %24 = xor i64 %.sroa.11.0, %11
+  %25 = and i64 %24, %0
+  %26 = xor i64 %25, 1
+  store i64 %26, ptr %6, align 8
+  %27 = xor i64 %.sroa.14.0, %10
+  %28 = and i64 %27, %0
+  %29 = xor i64 %28, 1
+  store i64 %29, ptr %7, align 8
+  %30 = load i64, ptr null, align 4294967296
+  %31 = or i64 %19, 1
+  %32 = or i64 %16, 1
+  %33 = add i64 %17, 1
+  %34 = mul i64 %29, 19
+  %35 = zext i64 %34 to i128
+  %36 = mul i64 %26, 19
+  %37 = zext i64 %36 to i128
+  %38 = mul i64 %23, 19
+  %39 = zext i64 %38 to i128
+  %40 = mul nuw nsw i128 %39, 24
+  %41 = zext i64 %32 to i128
+  %42 = mul nuw i128 %37, %41
+  %43 = zext i64 %31 to i128
+  %44 = mul nuw i128 %35, %43
+  %45 = zext i64 %33 to i128
+  %46 = mul i128 %1, %45
+  %47 = add i128 %40, %46
+  %48 = add i128 %47, %42
+  %49 = add i128 %48, %44
+  %50 = and i64 %29, 1
+  %51 = trunc i128 %49 to i64
+  %52 = and i64 %30, 1
+  %53 = add i64 %51, 1
+  %54 = and i64 %53, 1
+  %55 = lshr i64 %53, 1
+  %56 = and i64 %19, 1
+  %57 = or i64 %55, 1
+  br i1 %2, label %58, label %9
+
+58:
+  call void @g(ptr nonnull %4)
+  ret i32 0
+}
+
+declare void @g(ptr)
+