[llvm-branch-commits] [llvm] release/22.x: [LoopVectorize] Fix nondeterminism in loop-vectorize (#200833) (PR #203850)

Mon Jun 15 02:02:21 PDT 2026

https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/203850

Backport 7ff58e4747dcece251d3e5cf2e5d8de4670c5252

Requested by: @OCHyams

>From e304f71254184250956054e6c23bc2da9a27821b Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams at sony.com>
Date: Mon, 15 Jun 2026 08:29:11 +0100
Subject: [PATCH] [LoopVectorize] Fix nondeterminism in loop-vectorize
 (#200833)

The nondeterministic iteration over `AddrDefs` (SmallPtrSet) causes
nondeterministic output for the test case in this patch (reduced from a
C codebase). One of two different outputs is generated arbitrarily,
chosen roughly equally.

Between the two different outputs, the instruction
   `%3 = load i64, ptr %2, align 8`
sometimes has a cost of 4 and other times 9. The instruction is visited
twice in `setCostBasedWideningDecision` in the `AddrDefs` loop: once
directly as an element of `AddrDefs`, and the other time indirectly in
the lambda `UpdateMemOpUserCost` as a User of another `AddrDefs`
element. Each of those times `setWideningDecision` is called with a
different cost value; the later of the two calls sets the final value
(the earlier one is overwritten). Because `AddrDefs` iteration is
nondeterministic, the order of those two calls to `setWideningDecision`
is also nondeterministic, hence we see two different costs arbitrarily
between runs.

This patch prevents the nondeterministic iteration order by changing
`AddrDefs` from a SmallPtrSet to a SmallSetVector, which iterates in
insertion order.

Some additional info:
We observe the issue reproducing (frequently) in llvm-22 but not
llvm-21.

With the test case in this patch, the nondeterminism reproduces
frequently (< 3 tries) starting from commit
1f331e453fa9c328c165c739f61e17a2815ece82 when building on and targeting
x86_64 linux. Before that commit we haven't observed the nondeterminism
(> 500 tries), but we can't rule out that it may be observed with
different inputs, given that the SmallPtrSet has been in use since 2017.
N.B. Building on Windows bisects to a different commit.

(cherry picked from commit 7ff58e4747dcece251d3e5cf2e5d8de4670c5252)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 .../X86/nondetermisitic-widening-cost.ll      | 147 ++++++++++++++++++
 2 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/nondetermisitic-widening-cost.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 77589e06d9c33..08adc5a80d2c1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5799,7 +5799,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
     return;
 
   // Start with all scalar pointer uses.
-  SmallPtrSet<Instruction *, 8> AddrDefs;
+  SmallSetVector<Instruction *, 8> AddrDefs;
   for (BasicBlock *BB : TheLoop->blocks())
     for (Instruction &I : *BB) {
       Instruction *PtrDef =
@@ -5817,7 +5817,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
     for (auto &Op : I->operands())
       if (auto *InstOp = dyn_cast<Instruction>(Op))
         if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
-            AddrDefs.insert(InstOp).second)
+            AddrDefs.insert(InstOp))
           Worklist.push_back(InstOp);
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/nondetermisitic-widening-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/nondetermisitic-widening-cost.ll
new file mode 100644
index 0000000000000..b99322c0ce924
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/nondetermisitic-widening-cost.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt --passes=loop-vectorize %s -S | FileCheck %s
+
+; Check that we see expected deterministic (over multiple test runs) output.
+; NOTE: Beware, if this test fails it may be due to non-determinism.
+
+target triple = "x86_64-unknown-linux"
+
+define float @fun(i64 %0, float %1, ptr noalias  %a, ptr noalias %b, i64 %len) #0 {
+; CHECK-LABEL: define float @fun(
+; CHECK-SAME: i64 [[TMP0:%.*]], float [[TMP1:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[VLA:%.*]] = alloca float, i64 [[LEN]], align 16
+; CHECK-NEXT:    br label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [4 x i8], ptr [[VLA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP60:%.*]] = load ptr, ptr [[TMP48]], align 8
+; CHECK-NEXT:    [[TMP62:%.*]] = load ptr, ptr [[TMP49]], align 8
+; CHECK-NEXT:    [[TMP64:%.*]] = load ptr, ptr [[TMP50]], align 8
+; CHECK-NEXT:    [[TMP65:%.*]] = load ptr, ptr [[TMP51]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[TMP17]], align 8
+; CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[TMP18]], align 8
+; CHECK-NEXT:    [[TMP66:%.*]] = load i64, ptr [[TMP60]], align 8
+; CHECK-NEXT:    [[TMP33:%.*]] = load i64, ptr [[TMP62]], align 8
+; CHECK-NEXT:    [[TMP67:%.*]] = load i64, ptr [[TMP64]], align 8
+; CHECK-NEXT:    [[TMP35:%.*]] = load i64, ptr [[TMP65]], align 8
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP23]], align 8
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP24]], align 8
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP25]], align 8
+; CHECK-NEXT:    [[TMP34:%.*]] = load i64, ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP66]]
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP67]]
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP31]]
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP32]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP36]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP34]]
+; CHECK-NEXT:    [[TMP52:%.*]] = load float, ptr [[TMP68]], align 4
+; CHECK-NEXT:    [[TMP53:%.*]] = load float, ptr [[TMP69]], align 4
+; CHECK-NEXT:    [[TMP54:%.*]] = load float, ptr [[TMP70]], align 4
+; CHECK-NEXT:    [[TMP55:%.*]] = load float, ptr [[TMP43]], align 4
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x float> poison, float [[TMP52]], i32 0
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 1
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 2
+; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <4 x float> [[TMP58]], float [[TMP55]], i32 3
+; CHECK-NEXT:    [[TMP71:%.*]] = load float, ptr [[TMP44]], align 4
+; CHECK-NEXT:    [[TMP72:%.*]] = load float, ptr [[TMP45]], align 4
+; CHECK-NEXT:    [[TMP73:%.*]] = load float, ptr [[TMP46]], align 4
+; CHECK-NEXT:    [[TMP74:%.*]] = load float, ptr [[TMP47]], align 4
+; CHECK-NEXT:    [[TMP75:%.*]] = insertelement <4 x float> poison, float [[TMP71]], i32 0
+; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[TMP72]], i32 1
+; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x float> [[TMP76]], float [[TMP73]], i32 2
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x float> [[TMP77]], float [[TMP74]], i32 3
+; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr [4 x i8], ptr [[VLA]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr float, ptr [[TMP61]], i64 4
+; CHECK-NEXT:    store <4 x float> [[TMP59]], ptr [[TMP61]], align 4
+; CHECK-NEXT:    store <4 x float> [[TMP78]], ptr [[TMP79]], align 4
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr [4 x i8], ptr [[TMP4]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr float, ptr [[TMP63]], i64 4
+; CHECK-NEXT:    store <4 x float> [[BROADCAST_SPLAT]], ptr [[TMP63]], align 4
+; CHECK-NEXT:    store <4 x float> [[BROADCAST_SPLAT]], ptr [[TMP80]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr [8 x i8], ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load ptr, ptr [[ARRAYIDX16]], align 8
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP38]], align 8
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr [4 x i8], ptr [[VLA]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP40]], ptr [[ARRAYIDX21]], align 4
+; CHECK-NEXT:    [[TMP41:%.*]] = load i64, ptr [[B]], align 8
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[TMP41]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr [4 x i8], ptr [[VLA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr [4 x i8], ptr [[ARRAYIDX28]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP1]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[VLA]], align 4
+; CHECK-NEXT:    ret float [[R]]
+;
+entry:
+  %vla = alloca float, i64 %len, align 16
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx16 = getelementptr [8 x i8], ptr %a, i64 %iv
+  %2 = load ptr, ptr %arrayidx16, align 8
+  %3 = load i64, ptr %2, align 8
+  %arrayidx18 = getelementptr [4 x i8], ptr %a, i64 %3
+  %4 = load float, ptr %arrayidx18, align 4
+  %arrayidx21 = getelementptr [4 x i8], ptr %vla, i64 %iv
+  store float %4, ptr %arrayidx21, align 4
+  %5 = load i64, ptr %b, align 8
+  %arrayidx27 = getelementptr [4 x i8], ptr %a, i64 %5
+  %6 = load float, ptr %arrayidx27, align 4
+  %arrayidx28 = getelementptr [4 x i8], ptr %vla, i64 %0
+  %arrayidx30 = getelementptr [4 x i8], ptr %arrayidx28, i64 %iv
+  store float %1, ptr %arrayidx30, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, 128
+  br i1 %exitcond.not, label %for.end.loopexit, label %loop
+
+for.end.loopexit:
+  %r = load float, ptr %vla
+  ret float %r
+}
+
+attributes #0 = { "target-features"="+avx2" }