[llvm] SROA: Fix tree merge IRBuilder insert point (PR #189680)
Lewis Crawford via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 04:53:22 PDT 2026
https://github.com/LewisCrawford updated https://github.com/llvm/llvm-project/pull/189680
>From 121ab4d7bdde56dd8b50e06a089fd02d4f533af5 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Wed, 25 Mar 2026 18:56:05 +0000
Subject: [PATCH 1/2] SROA: Fix tree merge IRBuilder insert point
StoreInfos is sorted by slice offset, not program order.
Anchoring the IRBuilder at StoreInfos.back().Store (the last
store by offset) could emit shufflevectors ahead of SSA values
that are defined later in the same block, producing invalid IR.
Insert the merged shuffles immediately before TheLoad when
the load shares the store block. When the load is in a different
block, insert before the store block's terminator so the merge runs
after every store and any trailing instructions in that block.
---
llvm/lib/Transforms/Scalar/SROA.cpp | 9 ++-
...ctor-promotion-via-tree-structure-merge.ll | 61 +++++++++++++++++++
2 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 760b84000fe7b..b091909c0b069 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3029,7 +3029,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
// Instead of having these stores, we merge all the stored values into a
// vector and store the merged value into the alloca
std::queue<Value *> VecElements;
- IRBuilder<> Builder(StoreInfos.back().Store);
+ // StoreInfos is sorted by offset, not by block order. Anchoring to
+ // StoreInfos.back().Store (last by offset) can place shuffles before
+ // operands that appear later in the block (invalid SSA). Insert before
+ // TheLoad when it shares the store block (after all stores, before any
+ // later IR in that block). Otherwise insert before the store block's
+ // terminator so the merge runs after every store and any trailing
+ // instructions in that block.
+ IRBuilder<> Builder(LoadBB == StoreBB ? TheLoad : StoreBB->getTerminator());
for (const auto &Info : StoreInfos) {
DeletedValues.push_back(Info.Store);
VecElements.push(Info.StoredValue);
diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
index 8bfe0bb83051e..3f01827e55541 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
@@ -296,6 +296,67 @@ FalseBranch:
ret <8 x float> %result
}
+define <6 x float> @merge_unordered_indices_same_bb(<2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: define <6 x float> @merge_unordered_indices_same_bb(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALC:%.*]] = fadd <2 x float> [[A]], [[C]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[CALC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: ret <6 x float> [[TMP2]]
+;
+entry:
+ %alloca = alloca [6 x float]
+
+ %ptr4 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 4
+ store <2 x float> %c, ptr %ptr4
+
+ %ptr0 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %calc = fadd <2 x float> %a, %c
+
+ %ptr2 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 2
+ store <2 x float> %calc, ptr %ptr2
+
+ %result = load <6 x float>, ptr %alloca
+ ret <6 x float> %result
+}
+
+define <6 x float> @merge_unordered_indices_different_bb(<2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: define <6 x float> @merge_unordered_indices_different_bb(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALC:%.*]] = fadd <2 x float> [[A]], [[C]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[CALC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: br label %[[LOAD_BB:.*]]
+; CHECK: [[LOAD_BB]]:
+; CHECK-NEXT: ret <6 x float> [[TMP2]]
+;
+entry:
+ %alloca = alloca [6 x float]
+
+ %ptr4 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 4
+ store <2 x float> %c, ptr %ptr4
+
+ %ptr0 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %calc = fadd <2 x float> %a, %c
+
+ %ptr2 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 2
+ store <2 x float> %calc, ptr %ptr2
+
+ br label %load_bb
+
+load_bb:
+ %result = load <6 x float>, ptr %alloca
+ ret <6 x float> %result
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}
>From fc25c2e17713fa04933204a98e6c8503be38ca52 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Wed, 1 Apr 2026 11:20:53 +0000
Subject: [PATCH 2/2] Remove unused CHECK prefixes
Remove the separate CHECK prefixes for preserve-cfg
vs. modify-cfg in the test file, since they are unused.
---
.../SROA/vector-promotion-via-tree-structure-merge.ll | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
index 3f01827e55541..7ac05c078a4d3 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
-; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s
+; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
; Basic tree-structured merge: 4 stores of <2 x float> into <8 x float>
@@ -356,7 +356,3 @@ load_bb:
%result = load <6 x float>, ptr %alloca
ret <6 x float> %result
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-MODIFY-CFG: {{.*}}
-; CHECK-PRESERVE-CFG: {{.*}}
More information about the llvm-commits
mailing list