[llvm] [Hexagon] Add :mem_noshuf for store-load pairs with no scheduler Order dep (PR #181456)
Brian Cain via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 13 21:18:14 PST 2026
https://github.com/androm3da updated https://github.com/llvm/llvm-project/pull/181456
>From e85fdc43db780a322bb2c44e17407d2a881e0f50 Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain at oss.qualcomm.com>
Date: Fri, 13 Feb 2026 21:14:42 -0800
Subject: [PATCH] [Hexagon] Add :mem_noshuf for store-load pairs with no
scheduler Order dep
When TBAA tells the scheduler that a store and load access different
types, the scheduler omits the Order (memory) dependency edge between
them. On V65+ the packetizer can then place the store in slot 1 and
the load in slot 0 of the same packet. Without :mem_noshuf the
hardware is free to reorder the memory operations, which is unsound
when the pointers actually alias at runtime (TBAA can be overly
optimistic with type-punning patterns such as libc++ tree node casts).
Re-check aliasing with UseTBAA=false in the packetizer whenever a
store-load pair has no scheduler Order dependency, and mark the packet
:mem_noshuf if the accesses may alias. Skip the re-check when either
operand touches a PseudoSourceValue (stack slot, constant pool, GOT,
jump table) since TBAA is not the source of the NoAlias conclusion
for those accesses.
Bugs fixed:
- HexagonMCChecker SIGSEGV in std::set tree destruction
- HexagonMCCompound "out of slots" on legal packets
- Verifier false "Broken module found" errors
- LiveIntervals "reserved computation mismatch" assertion
- Systemic miscompilation across multiple files
---
.../Target/Hexagon/HexagonVLIWPacketizer.cpp | 51 +++++-
.../Hexagon/packetize-mem-noshuf-tbaa.ll | 167 ++++++++++++++++++
2 files changed, 217 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/Hexagon/packetize-mem-noshuf-tbaa.ll
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index d39b79a86753a..e4dd49a05fbc8 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -1394,9 +1394,53 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
return false;
}
+ // When the scheduler found no Order (memory) dependency between a
+ // store-load pair — either because there is no DAG edge at all, or
+ // because the only edges are Anti/register deps — the pair can land in
+ // the same V65+ packet. Re-check aliasing without TBAA (TBAA may have
+ // been the reason the scheduler omitted the Order edge) and, if the
+ // accesses may alias, mark the packet :mem_noshuf so the hardware does
+ // not reorder the memory operations.
+ auto CheckMemNoshufForSlot1Store = [&]() {
+ if (!Slot1Store || !MF.getSubtarget<HexagonSubtarget>().hasV65Ops())
+ return;
+ bool LoadJ = J.mayLoad(), StoreJ = J.mayStore();
+ bool LoadI = I.mayLoad(), StoreI = I.mayStore();
+ bool NVStoreJ = HII->isNewValueStore(J);
+ bool NVStoreI = HII->isNewValueStore(I);
+ bool IsVecJ = HII->isHVXVec(J);
+ bool IsVecI = HII->isHVXVec(I);
+
+ if (((LoadJ && StoreI && !NVStoreI) || (StoreJ && LoadI && !NVStoreJ)) &&
+ (J.getOpcode() != Hexagon::S2_allocframe &&
+ I.getOpcode() != Hexagon::S2_allocframe) &&
+ (J.getOpcode() != Hexagon::L2_deallocframe &&
+ I.getOpcode() != Hexagon::L2_deallocframe) &&
+ (!HII->isMemOp(J) && !HII->isMemOp(I)) && (!IsVecJ && !IsVecI)) {
+ // If either instruction accesses a stack slot, constant pool, GOT,
+ // or jump table (PseudoSourceValue), TBAA is not the source of the
+ // scheduler's NoAlias conclusion — skip the re-check. Overly
+ // optimistic TBAA results only affect heap-to-heap accesses through
+ // different pointer types (e.g. libc++ tree node pointer casts).
+ auto HasPSV = [](const MachineInstr &MI) {
+ for (const MachineMemOperand *MMO : MI.memoperands())
+ if (MMO->getPseudoValue())
+ return true;
+ return false;
+ };
+ if (HasPSV(J) || HasPSV(I))
+ return;
+
+ if (J.mayAlias(AA, I, /*UseTBAA=*/false))
+ setmemShufDisabled(true);
+ }
+ };
+
// There no dependency between a prolog instruction and its successor.
- if (!SUJ->isSucc(SUI))
+ if (!SUJ->isSucc(SUI)) {
+ CheckMemNoshufForSlot1Store();
return true;
+ }
for (unsigned i = 0; i < SUJ->Succs.size(); ++i) {
if (FoundSequentialDependence)
@@ -1628,6 +1672,11 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
return false;
}
+ // The dependency loop found no blocking dependence — only Anti deps
+ // (or Order deps that the V65 Slot1Store path already handled).
+ // Still need to guard against a store-load pair whose Order dep was
+ // omitted by the scheduler due to TBAA.
+ CheckMemNoshufForSlot1Store();
return true;
}
diff --git a/llvm/test/CodeGen/Hexagon/packetize-mem-noshuf-tbaa.ll b/llvm/test/CodeGen/Hexagon/packetize-mem-noshuf-tbaa.ll
new file mode 100644
index 0000000000000..d95f8fdf9d633
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/packetize-mem-noshuf-tbaa.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv65 -O2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv62 -O2 < %s | FileCheck %s --check-prefix=V62
+
+; When the scheduler uses TBAA to conclude that a store and load access
+; different types, it omits the Order (memory) dependency edge. On V65+
+; the packetizer can then place the store in slot 1 and the load in slot
+; 0 of the same packet. Without :mem_noshuf the hardware is free to
+; reorder the memory operations, which is unsound when the pointers
+; actually alias at runtime (TBAA can be overly optimistic with
+; type-punning patterns such as libc++ tree-node pointer casts).
+;
+; These tests verify that the packetizer adds :mem_noshuf when it
+; re-checks aliasing without TBAA and the accesses may alias.
+
+;--- Tree node insertion pattern ---
+;
+; Models the inlined std::set::__insert_node_at sequence where a store
+; to new_node->__parent_ (through one pointer type) is followed by a
+; load from begin_node->__left_ (through a different pointer type).
+; TBAA says the accesses are to different types, but the pointers may
+; alias at runtime because libc++ tree nodes cast between base/derived
+; node types.
+
+; CHECK-LABEL: test_tree_node_insert:
+; CHECK: {
+; CHECK-DAG: memw(r0+#0) = r1
+; CHECK-DAG: r{{[0-9]+}} = memw(r2+#0)
+; CHECK: } :mem_noshuf
+
+; V62-LABEL: test_tree_node_insert:
+; V62-NOT: :mem_noshuf
+
+define ptr @test_tree_node_insert(ptr %new_node, ptr %parent, ptr %child_ptr) #0 {
+entry:
+ store ptr %parent, ptr %new_node, align 4, !tbaa !0
+ %child = load ptr, ptr %child_ptr, align 4, !tbaa !3
+ ret ptr %child
+}
+
+;--- Compound instruction logic ---
+;
+; Models a function where a store and load with different TBAA types
+; are followed by a comparison and branch. When the store-load pair
+; is miscompiled (wrong memory ordering), the loaded value used for
+; the comparison is stale, causing incorrect control-flow decisions.
+; In the original bug this broke compound instruction splitting in
+; HexagonMCCompound.cpp.
+
+; CHECK-LABEL: test_store_load_branch:
+; CHECK: {
+; CHECK-DAG: memw(r0+#0) = r2
+; CHECK-DAG: r{{[0-9]+}} = memw(r1+#0)
+; CHECK: } :mem_noshuf
+
+; V62-LABEL: test_store_load_branch:
+; V62-NOT: :mem_noshuf
+
+define i32 @test_store_load_branch(ptr %flag_ptr, ptr %data_ptr, i32 %val) #0 {
+entry:
+ store i32 %val, ptr %flag_ptr, align 4, !tbaa !0
+ %data = load i32, ptr %data_ptr, align 4, !tbaa !3
+ %cmp = icmp eq i32 %data, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %sum = add i32 %val, 1
+ ret i32 %sum
+
+if.end:
+ ret i32 %data
+}
+
+;--- Simple store-load with immediate ---
+;
+; The simplest case: store an immediate, load through a different
+; pointer. This models the Verifier.cpp miscompile where even trivial
+; IR triggered false "Broken module" errors because a stored boolean
+; flag was read back through a differently-typed pointer with stale
+; data.
+
+; CHECK-LABEL: test_store_imm_load:
+; CHECK: {
+; CHECK-DAG: memw(r0+#0) = #1
+; CHECK-DAG: r0 = memw(r1+#0)
+; CHECK: } :mem_noshuf
+
+; V62-LABEL: test_store_imm_load:
+; V62-NOT: :mem_noshuf
+
+define i32 @test_store_imm_load(ptr %p, ptr %q) #0 {
+entry:
+ store i32 1, ptr %p, align 4, !tbaa !0
+ %v = load i32, ptr %q, align 4, !tbaa !3
+ ret i32 %v
+}
+
+;--- Anti-dependency path ---
+;
+; Models the LiveIntervals/MachineRegisterInfo pattern where an
+; iterator's pointer is loaded (defining a register), then a store
+; through a different TBAA type uses overlapping registers. The
+; scheduler creates an Anti dependency (register reuse) but no Order
+; dependency (TBAA says different types), so the pair can land in the
+; same packet.
+
+; CHECK-LABEL: test_anti_dep_path:
+; CHECK: {
+; CHECK-DAG: r{{[0-9]+}} = memw(r1+#0)
+; CHECK-DAG: memw(r0+#0) = #42
+; CHECK: } :mem_noshuf
+
+; V62-LABEL: test_anti_dep_path:
+; V62-NOT: :mem_noshuf
+
+define i32 @test_anti_dep_path(ptr %p, ptr %q) #0 {
+entry:
+ %addr = load ptr, ptr %q, align 4, !tbaa !3
+ store i32 42, ptr %p, align 4, !tbaa !0
+ %v = load i32, ptr %addr, align 4, !tbaa !3
+ ret i32 %v
+}
+
+;--- Multiple store-load pairs (systemic) ---
+;
+; Models the systemic miscompilation pattern where multiple unrelated
+; store-load pairs in the same function all need :mem_noshuf
+; protection. Each pair uses a different TBAA type combination,
+; modeling accesses to different C++ class hierarchies that share
+; memory through pointer casts.
+
+; CHECK-LABEL: test_multi_store_load:
+; CHECK: {
+; CHECK-DAG: memw(r0+#0) = #10
+; CHECK-DAG: r{{[0-9]+}} = memw(r1+#0)
+; CHECK: } :mem_noshuf
+; CHECK: {
+; CHECK-DAG: memw(r2+#0) = r
+; CHECK-DAG: r{{[0-9]+}} = memw(r3+#0)
+; CHECK: } :mem_noshuf
+
+; V62-LABEL: test_multi_store_load:
+; V62-NOT: :mem_noshuf
+
+define i32 @test_multi_store_load(ptr %p1, ptr %p2, ptr %p3, ptr %p4) #0 {
+entry:
+ store i32 10, ptr %p1, align 4, !tbaa !0
+ %v1 = load i32, ptr %p2, align 4, !tbaa !3
+ store i32 %v1, ptr %p3, align 4, !tbaa !5
+ %v2 = load i32, ptr %p4, align 4, !tbaa !7
+ %sum = add i32 %v1, %v2
+ ret i32 %sum
+}
+
+attributes #0 = { nounwind }
+
+; TBAA type hierarchy: four unrelated types under the same root.
+; The scheduler sees these as non-aliasing, but they may alias at
+; runtime through pointer casts (e.g., libc++ tree node base/derived).
+!0 = !{!1, !1, i64 0} ; type_a
+!1 = !{!"type_a", !2}
+!2 = !{!"tbaa_root"}
+!3 = !{!4, !4, i64 0} ; type_b
+!4 = !{!"type_b", !2}
+!5 = !{!6, !6, i64 0} ; type_c
+!6 = !{!"type_c", !2}
+!7 = !{!8, !8, i64 0} ; type_d
+!8 = !{!"type_d", !2}
More information about the llvm-commits
mailing list