[clang] [llvm] [InferAlignment] Propagate alignment between loads/stores of the same base pointer (PR #145733)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 08:36:51 PDT 2025
https://github.com/dakersnar updated https://github.com/llvm/llvm-project/pull/145733
>From 15600f92b838448691eb3d27fbd97331fcdc7bb2 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 25 Jun 2025 15:58:20 +0000
Subject: [PATCH 01/16] [LoadStoreVectorizer] Propagate alignment through
contiguous chain to improve vectorization
---
.../Vectorize/LoadStoreVectorizer.cpp | 35 +++
.../LoadStoreVectorizer/prop-align.ll | 296 ++++++++++++++++++
2 files changed, 331 insertions(+)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89f63c3b66aad..e14a936b764e5 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -343,6 +343,9 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ /// Propagates the best alignment in a chain of contiguous accesses
+ void propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const;
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -716,6 +719,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
+ // We know that the accesses are contiguous. Propagate alignment
+ // information so that slices of the chain can still be vectorized.
+ propagateBestAlignmentsInChain(C);
+ LLVM_DEBUG({
+ dbgs() << "LSV: Chain after alignment propagation:\n";
+ dumpChain(C);
+ });
+
std::vector<Chain> Ret;
for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
// Find candidate chains of size not greater than the largest vector reg.
@@ -1634,3 +1645,27 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
.sextOrTrunc(OrigBitWidth);
return std::nullopt;
}
+
+void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
+ ChainElem BestAlignedElem = C[0];
+ Align BestAlignSoFar = getLoadStoreAlignment(C[0].Inst);
+
+ for (const ChainElem &E : C) {
+ Align OrigAlign = getLoadStoreAlignment(E.Inst);
+ if (OrigAlign > BestAlignSoFar) {
+ BestAlignedElem = E;
+ BestAlignSoFar = OrigAlign;
+ }
+
+ APInt OffsetFromBestAlignedElem =
+ E.OffsetFromLeader - BestAlignedElem.OffsetFromLeader;
+ assert(OffsetFromBestAlignedElem.isNonNegative());
+ // commonAlignment is equivalent to a greatest common power-of-two divisor;
+ // it returns the largest power of 2 that divides both A and B.
+ Align NewAlign = commonAlignment(
+ BestAlignSoFar, OffsetFromBestAlignedElem.getLimitedValue());
+ if (NewAlign > OrigAlign)
+ setLoadStoreAlignment(E.Inst, NewAlign);
+ }
+ return;
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
new file mode 100644
index 0000000000000..a1878dc051d99
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; The IR has the first float3 labeled with align 16, and that 16 should
+; be propagated such that the second set of 4 values
+; can also be vectorized together.
+%struct.float3 = type { float, float, float }
+%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
+
+define void @testStore(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStore(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store float 0.000000e+00, ptr %1, align 16
+ %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
+ store float 0.000000e+00, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoad(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoad(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l1 = load float, ptr %1, align 16
+ %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
+ %l2 = load float, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
+ %l3 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
+ %l4 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
+ %l5 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
+ %l6 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
+ %l7 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
+ %l8 = load i32, ptr %getElem13, align 4
+ ret void
+}
+
+; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
+
+define void @testStorei8(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStorei8(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store float 0.000000e+00, ptr %1, align 16
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ store float 0.000000e+00, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoadi8(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoadi8(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l1 = load float, ptr %1, align 16
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load float, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load i32, ptr %getElem13, align 4
+ ret void
+}
+
+
+; This version of the test adjusts the struct to hold two i32s at the beginning,
+; but still assumes that the first float3 is 16 aligned. If the alignment
+; propagation works correctly, it should be able to load this struct in three
+; loads: a 2x32, a 4x32, and a 4x32. Without the alignment propagation, the last
+; 4x32 will instead be a 2x32 and a 2x32
+%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
+
+define void @testStore_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStore_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store i32 0, ptr %1, align 8
+ %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
+ store i32 0, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
+ store float 0.000000e+00, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
+ store float 0.000000e+00, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoad_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoad_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 8
+ %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
+ %l3 = load float, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
+ %l5 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
+ %l7 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
+ %l9 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
+ %l0 = load i32, ptr %getElem13, align 4
+ ret void
+}
+
+; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
+
+define void @testStorei8_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStorei8_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store i32 0, ptr %1, align 8
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ store i32 0, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ store float 0.000000e+00, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ store float 0.000000e+00, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoadi8_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoadi8_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 8
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ %l9 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ %l0 = load i32, ptr %getElem13, align 4
+ ret void
+}
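
[Editor's note] As a side note for reviewers: below is a minimal standalone sketch of what the forward pass in propagateBestAlignmentsInChain computes, using plain integers instead of LLVM's ChainElem/Align/APInt types (the Elem/propagateForward names are illustrative only, not part of the patch). commonAlignment here mirrors the LLVM helper of the same name: the largest power of two dividing both arguments. The offsets and alignments in main() are the ones from @testStore above.

// Illustrative sketch only, not the LLVM implementation.
#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Elem {
  uint64_t OffsetFromLeader;
  uint64_t Alignment;
};

// Largest power of two dividing both the (power-of-two) alignment A and Offset.
static uint64_t commonAlignment(uint64_t A, uint64_t Offset) {
  return Offset == 0 ? A : std::min(A, uint64_t(1) << std::countr_zero(Offset));
}

static void propagateForward(std::vector<Elem> &Chain) {
  uint64_t BestAlign = Chain[0].Alignment;
  uint64_t BestOffset = Chain[0].OffsetFromLeader;
  for (Elem &E : Chain) {
    if (E.Alignment > BestAlign) {
      BestAlign = E.Alignment;
      BestOffset = E.OffsetFromLeader;
    }
    // The best-aligned element seen so far pins down the low bits of this
    // element's address up to the offset delta between them.
    uint64_t Delta = E.OffsetFromLeader - BestOffset;
    E.Alignment = std::max(E.Alignment, commonAlignment(BestAlign, Delta));
  }
}

int main() {
  // @testStore: the leader at offset 0 is align 16; the element at offset 16
  // starts at align 4 and gets raised to 16, enabling the second <4 x i32>.
  std::vector<Elem> Chain = {{0, 16}, {4, 4},  {8, 8},  {12, 4},
                             {16, 4}, {20, 4}, {24, 8}, {28, 4}};
  propagateForward(Chain);
  for (const Elem &E : Chain)
    std::printf("offset %2llu -> align %llu\n",
                (unsigned long long)E.OffsetFromLeader,
                (unsigned long long)E.Alignment);
}
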
>From b905f1c05ff6e0e531bf5cb65fe067635673e76c Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 25 Jun 2025 17:48:57 +0000
Subject: [PATCH 02/16] Address feedback, add reverse propagation, simplify and
expand unit tests
---
.../Vectorize/LoadStoreVectorizer.cpp | 43 ++--
.../LoadStoreVectorizer/prop-align.ll | 186 ++++++++++++++++--
2 files changed, 194 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index e14a936b764e5..95ec574be7d2c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1647,25 +1647,30 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
}
void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
- ChainElem BestAlignedElem = C[0];
- Align BestAlignSoFar = getLoadStoreAlignment(C[0].Inst);
-
- for (const ChainElem &E : C) {
- Align OrigAlign = getLoadStoreAlignment(E.Inst);
- if (OrigAlign > BestAlignSoFar) {
- BestAlignedElem = E;
- BestAlignSoFar = OrigAlign;
+ auto PropagateAlignments = [](auto ChainIt) {
+ ChainElem BestAlignedElem = *ChainIt.begin();
+ Align BestAlignSoFar = getLoadStoreAlignment(BestAlignedElem.Inst);
+
+ for (const ChainElem &E : ChainIt) {
+ Align OrigAlign = getLoadStoreAlignment(E.Inst);
+ if (OrigAlign > BestAlignSoFar) {
+ BestAlignedElem = E;
+ BestAlignSoFar = OrigAlign;
+ continue;
+ }
+
+ APInt DeltaFromBestAlignedElem =
+ APIntOps::abdu(E.OffsetFromLeader, BestAlignedElem.OffsetFromLeader);
+ // commonAlignment is equivalent to a greatest common power-of-two
+ // divisor; it returns the largest power of 2 that divides both A and B.
+ Align NewAlign = commonAlignment(
+ BestAlignSoFar, DeltaFromBestAlignedElem.getLimitedValue());
+ if (NewAlign > OrigAlign)
+ setLoadStoreAlignment(E.Inst, NewAlign);
}
+ };
- APInt OffsetFromBestAlignedElem =
- E.OffsetFromLeader - BestAlignedElem.OffsetFromLeader;
- assert(OffsetFromBestAlignedElem.isNonNegative());
- // commonAlignment is equivalent to a greatest common power-of-two divisor;
- // it returns the largest power of 2 that divides both A and B.
- Align NewAlign = commonAlignment(
- BestAlignSoFar, OffsetFromBestAlignedElem.getLimitedValue());
- if (NewAlign > OrigAlign)
- setLoadStoreAlignment(E.Inst, NewAlign);
- }
- return;
+ // Propagate forwards and backwards.
+ PropagateAlignments(C);
+ PropagateAlignments(reverse(C));
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
index a1878dc051d99..aeface5f91abd 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
@@ -7,9 +7,9 @@
%struct.float3 = type { float, float, float }
%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
-define void @testStore(ptr nocapture writeonly %1) {
+define void @testStore(ptr %1) {
; CHECK-LABEL: define void @testStore(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
@@ -33,9 +33,9 @@ define void @testStore(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoad(ptr nocapture writeonly %1) {
+define void @testLoad(ptr %1) {
; CHECK-LABEL: define void @testLoad(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -71,9 +71,9 @@ define void @testLoad(ptr nocapture writeonly %1) {
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
-define void @testStorei8(ptr nocapture writeonly %1) {
+define void @testStorei8(ptr %1) {
; CHECK-LABEL: define void @testStorei8(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
@@ -97,9 +97,9 @@ define void @testStorei8(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoadi8(ptr nocapture writeonly %1) {
+define void @testLoadi8(ptr %1) {
; CHECK-LABEL: define void @testLoadi8(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -141,9 +141,9 @@ define void @testLoadi8(ptr nocapture writeonly %1) {
; 4x32 will instead be a 2x32 and a 2x32
%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
-define void @testStore_2(ptr nocapture writeonly %1) {
+define void @testStore_2(ptr %1) {
; CHECK-LABEL: define void @testStore_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
@@ -173,9 +173,9 @@ define void @testStore_2(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoad_2(ptr nocapture writeonly %1) {
+define void @testLoad_2(ptr %1) {
; CHECK-LABEL: define void @testLoad_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
@@ -219,9 +219,9 @@ define void @testLoad_2(ptr nocapture writeonly %1) {
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
-define void @testStorei8_2(ptr nocapture writeonly %1) {
+define void @testStorei8_2(ptr %1) {
; CHECK-LABEL: define void @testStorei8_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
@@ -251,9 +251,9 @@ define void @testStorei8_2(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoadi8_2(ptr nocapture writeonly %1) {
+define void @testLoadi8_2(ptr %1) {
; CHECK-LABEL: define void @testLoadi8_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
@@ -294,3 +294,157 @@ define void @testLoadi8_2(ptr nocapture writeonly %1) {
%l0 = load i32, ptr %getElem13, align 4
ret void
}
+
+; Test that the alignment propagation works both forwards and backwards.
+; With the "align 16" placed where it is,
+; we should end up with a v2 followed by two v4s followed by a v2.
+define void @test_forward_and_reverse(ptr %1) {
+; CHECK-LABEL: define void @test_forward_and_reverse(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 4
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem8, align 4
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load float, ptr %getElem10, align 16
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ %l9 = load float, ptr %getElem12, align 4
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ %l0 = load float, ptr %getElem13, align 4
+ %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
+ %l11 = load i32, ptr %getElem14, align 4
+ %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
+ %l12 = load i32, ptr %getElem15, align 4
+ ret void
+}
+
+; Test an edge case where the defined alignment is max align
+define void @test_forward_and_reverse_max_align(ptr %1) {
+; CHECK-LABEL: define void @test_forward_and_reverse_max_align(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 4294967296
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 4
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem8, align 4
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load float, ptr %getElem10, align 4294967296
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ %l9 = load float, ptr %getElem12, align 4
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ %l0 = load float, ptr %getElem13, align 4
+ %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
+ %l11 = load i32, ptr %getElem14, align 4
+ %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
+ %l12 = load i32, ptr %getElem15, align 4
+ ret void
+}
+
+define void @test_i8_elements(ptr %1) {
+; CHECK-LABEL: define void @test_i8_elements(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GETELEM1]], align 4
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 6
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GETELEM10]], align 4
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i8> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i8> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i8> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr [[GETELEM14]], align 4
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
+; CHECK-NEXT: ret void
+;
+ %l = load i8, ptr %1, align 1
+ %getElem = getelementptr inbounds i8, ptr %1, i64 1
+ %l2 = load i8, ptr %getElem, align 1
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 2
+ %l3 = load i8, ptr %getElem1, align 1
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 3
+ %l4 = load i8, ptr %getElem2, align 1
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 4
+ %l5 = load i8, ptr %getElem8, align 1
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 5
+ %l6 = load i8, ptr %getElem9, align 1
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 6
+ %l7 = load i8, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 7
+ %l8 = load i8, ptr %getElem11, align 1
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 8
+ %l9 = load i8, ptr %getElem12, align 1
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 9
+ %l0 = load i8, ptr %getElem13, align 1
+ %getElem14 = getelementptr inbounds i8, ptr %1, i64 10
+ %l11 = load i8, ptr %getElem14, align 1
+ %getElem15 = getelementptr inbounds i8, ptr %1, i64 11
+ %l12 = load i8, ptr %getElem15, align 1
+ ret void
+}
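
[Editor's note] Another illustrative sketch (same plain-integer setup as above, not the LLVM code) of the forward-plus-reverse scheme this patch introduces: the same scan runs over the chain and over its reverse, and the distance to the best-aligned element seen so far is taken as an absolute difference, which is what APIntOps::abdu does in the real code. Running it on the offsets from @test_forward_and_reverse reproduces the alignments the CHECK lines expect.

#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Elem {
  uint64_t Offset;
  uint64_t Alignment;
};

static uint64_t commonAlignment(uint64_t A, uint64_t Offset) {
  return Offset == 0 ? A : std::min(A, uint64_t(1) << std::countr_zero(Offset));
}

// One pass over [Begin, End); called once forwards and once over the reverse.
template <typename Iter> static void propagate(Iter Begin, Iter End) {
  uint64_t BestAlign = Begin->Alignment;
  uint64_t BestOffset = Begin->Offset;
  for (Iter It = Begin; It != End; ++It) {
    if (It->Alignment > BestAlign) {
      BestAlign = It->Alignment;
      BestOffset = It->Offset;
      continue;
    }
    // Absolute distance to the best-aligned element seen so far (abdu).
    uint64_t Delta = It->Offset > BestOffset ? It->Offset - BestOffset
                                             : BestOffset - It->Offset;
    It->Alignment = std::max(It->Alignment, commonAlignment(BestAlign, Delta));
  }
}

int main() {
  // @test_forward_and_reverse: only the access at offset 24 is align 16.
  std::vector<Elem> C;
  for (uint64_t Off = 0; Off <= 44; Off += 4)
    C.push_back({Off, Off == 24 ? uint64_t(16) : uint64_t(4)});
  propagate(C.begin(), C.end());   // forward
  propagate(C.rbegin(), C.rend()); // reverse
  for (const Elem &E : C)
    std::printf("offset %2llu -> align %llu\n",
                (unsigned long long)E.Offset, (unsigned long long)E.Alignment);
  // Offsets 8 and 40 end up at align 16 and offset 0 at align 8, which is
  // what lets the chain split into v2 + v4 + v4 + v2 as the test expects.
}
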
>From cd9e9f6cec8a170f3037a0dbbbd5a001a10c821e Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 25 Jun 2025 22:49:08 +0000
Subject: [PATCH 03/16] For consistency, stop delaying load/store alignment upgrades
---
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 9 +--------
.../LoadStoreVectorizer/X86/massive_indirection.ll | 2 +-
2 files changed, 2 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 95ec574be7d2c..d3c22dea72efb 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -834,6 +834,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
<< Alignment.value() << " to " << NewAlign.value()
<< "\n");
Alignment = NewAlign;
+ setLoadStoreAlignment(C[CBegin].Inst, Alignment);
}
}
@@ -891,14 +892,6 @@ bool Vectorizer::vectorizeChain(Chain &C) {
VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
Align Alignment = getLoadStoreAlignment(C[0].Inst);
- // If this is a load/store of an alloca, we might have upgraded the alloca's
- // alignment earlier. Get the new alignment.
- if (AS == DL.getAllocaAddrSpace()) {
- Alignment = std::max(
- Alignment,
- getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
- MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
- }
// All elements of the chain must have the same scalar-type size.
#ifndef NDEBUG
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
index fe8a7e58a6a57..0931caa1fde8a 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -155,7 +155,7 @@ define void @variadics1(ptr %vlist) {
; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
; CHECK-NEXT: [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 8
; CHECK-NEXT: [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[X5:%.*]] = fadd double [[X42]], [[X31]]
>From 9c99e0a59ea12fe08c46b9627e9ce6d08ed4faf8 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 10 Jul 2025 19:49:32 +0000
Subject: [PATCH 04/16] Refactor algorithm
---
.../Vectorize/LoadStoreVectorizer.cpp | 46 ++++++++-----------
1 file changed, 19 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index d3c22dea72efb..c9dba72f46e47 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -345,7 +345,7 @@ class Vectorizer {
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
/// Propagates the best alignment in a chain of contiguous accesses
- void propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const;
+ void propagateBestAlignmentInChain(ArrayRef<ChainElem> C) const;
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -721,7 +721,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
// We know that the accesses are contiguous. Propagate alignment
// information so that slices of the chain can still be vectorized.
- propagateBestAlignmentsInChain(C);
+ propagateBestAlignmentInChain(C);
LLVM_DEBUG({
dbgs() << "LSV: Chain after alignment propagation:\n";
dumpChain(C);
@@ -1639,31 +1639,23 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
return std::nullopt;
}
-void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
- auto PropagateAlignments = [](auto ChainIt) {
- ChainElem BestAlignedElem = *ChainIt.begin();
- Align BestAlignSoFar = getLoadStoreAlignment(BestAlignedElem.Inst);
-
- for (const ChainElem &E : ChainIt) {
- Align OrigAlign = getLoadStoreAlignment(E.Inst);
- if (OrigAlign > BestAlignSoFar) {
- BestAlignedElem = E;
- BestAlignSoFar = OrigAlign;
- continue;
- }
-
- APInt DeltaFromBestAlignedElem =
- APIntOps::abdu(E.OffsetFromLeader, BestAlignedElem.OffsetFromLeader);
- // commonAlignment is equivalent to a greatest common power-of-two
- // divisor; it returns the largest power of 2 that divides both A and B.
- Align NewAlign = commonAlignment(
- BestAlignSoFar, DeltaFromBestAlignedElem.getLimitedValue());
- if (NewAlign > OrigAlign)
- setLoadStoreAlignment(E.Inst, NewAlign);
+void Vectorizer::propagateBestAlignmentInChain(ArrayRef<ChainElem> C) const {
+ // Find the element in the chain with the best alignment and its offset.
+ Align BestAlign = getLoadStoreAlignment(C[0].Inst);
+ APInt BestAlignOffset = C[0].OffsetFromLeader;
+ for (const ChainElem &Elem : C) {
+ Align ElemAlign = getLoadStoreAlignment(Elem.Inst);
+ if (ElemAlign > BestAlign) {
+ BestAlign = ElemAlign;
+ BestAlignOffset = Elem.OffsetFromLeader;
}
- };
+ }
- // Propagate forwards and backwards.
- PropagateAlignments(C);
- PropagateAlignments(reverse(C));
+ // Propagate the best alignment to other elements in the chain, if possible.
+ for (const ChainElem &Elem : C) {
+ APInt OffsetDelta = APIntOps::abdu(Elem.OffsetFromLeader, BestAlignOffset);
+ Align NewAlign = commonAlignment(BestAlign, OffsetDelta.getLimitedValue());
+ if (NewAlign > getLoadStoreAlignment(Elem.Inst))
+ setLoadStoreAlignment(Elem.Inst, NewAlign);
+ }
}
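
[Editor's note] For a concrete trace of the refactored version: in @test_forward_and_reverse the single best-aligned element is the access at offset 24 with align 16, so offset 8 gets commonAlignment(16, |8 - 24|) = 16, offset 0 gets commonAlignment(16, 24) = 8, and offset 40 gets commonAlignment(16, 16) = 16, matching the alignments the existing CHECK lines expect.
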
>From 8db304a8e0f37ab7199832ede79654c81d3e46e7 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 22 Jul 2025 16:08:03 +0000
Subject: [PATCH 05/16] Alternate approach to solve the problem in
InferAlignment
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 41 ++++++
.../propagate-from-other-load-stores.ll | 136 ++++++++++++++++++
2 files changed, 177 insertions(+)
create mode 100644 llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 0ddc23152d84f..0570ba653c2dc 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -21,6 +21,43 @@
using namespace llvm;
+static bool tryToPropagateAlign(Function &F, const DataLayout &DL) {
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ // We need to reset the map for each block because alignment information
+ // can't be propagated across blocks. This is because control flow could
+ // be dependent on the address at runtime, making an alignment assumption
+ // within one block not true in another. Some sort of dominator tree
+ // approach could be better, but restricting within a basic block is correct
+ // too.
+ DenseMap<Value *, Align> BestBasePointerAligns;
+ for (Instruction &I : BB) {
+ if (auto *PtrOp = getLoadStorePointerOperand(&I)) {
+ Align LoadStoreAlign = getLoadStoreAlignment(&I);
+ APInt OffsetFromBase = APInt(
+ DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace()),
+ 0);
+ PtrOp = PtrOp->stripAndAccumulateInBoundsConstantOffsets(
+ DL, OffsetFromBase);
+ Align BasePointerAlign =
+ commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
+
+ if (BestBasePointerAligns.count(PtrOp) &&
+ BestBasePointerAligns[PtrOp] > BasePointerAlign) {
+ Align BetterLoadStoreAlign = commonAlignment(
+ BestBasePointerAligns[PtrOp], OffsetFromBase.getLimitedValue());
+ setLoadStoreAlignment(&I, BetterLoadStoreAlign);
+ Changed = true;
+ } else {
+ BestBasePointerAligns[PtrOp] = BasePointerAlign;
+ }
+ }
+ }
+ }
+ return Changed;
+}
+
static bool tryToImproveAlign(
const DataLayout &DL, Instruction *I,
function_ref<Align(Value *PtrOp, Align OldAlign, Align PrefAlign)> Fn) {
@@ -70,6 +107,10 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
}
}
+ // Propagate alignment between loads and stores that originate from the same
+ // base pointer
+ Changed |= tryToPropagateAlign(F, DL);
+
return Changed;
}
diff --git a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
new file mode 100644
index 0000000000000..4115d599ccd79
--- /dev/null
+++ b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s
+%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
+%struct.float3 = type { float, float, float }
+
+
+; ------------------------------------------------------------------------------
+; Test that we can propagate the align 16 to the offset-16 load and store that start at align 4
+; ------------------------------------------------------------------------------
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define void @prop_align(ptr noundef readonly captures(none) %v, ptr noundef writeonly captures(none) initializes((0, 32)) %vout) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @prop_align(
+; CHECK-SAME: ptr noundef readonly captures(none) [[V:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[VOUT:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load float, ptr [[V]], align 16
+; CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 4
+; CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load float, ptr [[DOTUNPACK_ELT7]], align 4
+; CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 8
+; CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load float, ptr [[DOTUNPACK_ELT9]], align 8
+; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 12
+; CHECK-NEXT: [[DOTUNPACK2_UNPACK:%.*]] = load float, ptr [[DOTELT1]], align 4
+; CHECK-NEXT: [[DOTUNPACK2_ELT12:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 16
+; CHECK-NEXT: [[DOTUNPACK2_UNPACK13:%.*]] = load float, ptr [[DOTUNPACK2_ELT12]], align 16
+; CHECK-NEXT: [[DOTUNPACK2_ELT14:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 20
+; CHECK-NEXT: [[DOTUNPACK2_UNPACK15:%.*]] = load float, ptr [[DOTUNPACK2_ELT14]], align 4
+; CHECK-NEXT: [[DOTELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 24
+; CHECK-NEXT: [[DOTUNPACK4:%.*]] = load i32, ptr [[DOTELT3]], align 8
+; CHECK-NEXT: [[DOTELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 28
+; CHECK-NEXT: [[DOTUNPACK6:%.*]] = load i32, ptr [[DOTELT5]], align 4
+; CHECK-NEXT: store float [[DOTUNPACK_UNPACK]], ptr [[VOUT]], align 16
+; CHECK-NEXT: [[VOUT_REPACK23:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 4
+; CHECK-NEXT: store float [[DOTUNPACK_UNPACK8]], ptr [[VOUT_REPACK23]], align 4
+; CHECK-NEXT: [[VOUT_REPACK25:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 8
+; CHECK-NEXT: store float [[DOTUNPACK_UNPACK10]], ptr [[VOUT_REPACK25]], align 8
+; CHECK-NEXT: [[VOUT_REPACK17:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 12
+; CHECK-NEXT: store float [[DOTUNPACK2_UNPACK]], ptr [[VOUT_REPACK17]], align 4
+; CHECK-NEXT: [[VOUT_REPACK17_REPACK27:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 16
+; CHECK-NEXT: store float [[DOTUNPACK2_UNPACK13]], ptr [[VOUT_REPACK17_REPACK27]], align 16
+; CHECK-NEXT: [[VOUT_REPACK17_REPACK29:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 20
+; CHECK-NEXT: store float [[DOTUNPACK2_UNPACK15]], ptr [[VOUT_REPACK17_REPACK29]], align 4
+; CHECK-NEXT: [[VOUT_REPACK19:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 24
+; CHECK-NEXT: store i32 [[DOTUNPACK4]], ptr [[VOUT_REPACK19]], align 8
+; CHECK-NEXT: [[VOUT_REPACK21:%.*]] = getelementptr inbounds nuw i8, ptr [[VOUT]], i64 28
+; CHECK-NEXT: store i32 [[DOTUNPACK6]], ptr [[VOUT_REPACK21]], align 4
+; CHECK-NEXT: ret void
+;
+ %.unpack.unpack = load float, ptr %v, align 16
+ %.unpack.elt7 = getelementptr inbounds nuw i8, ptr %v, i64 4
+ %.unpack.unpack8 = load float, ptr %.unpack.elt7, align 4
+ %.unpack.elt9 = getelementptr inbounds nuw i8, ptr %v, i64 8
+ %.unpack.unpack10 = load float, ptr %.unpack.elt9, align 8
+ %.elt1 = getelementptr inbounds nuw i8, ptr %v, i64 12
+ %.unpack2.unpack = load float, ptr %.elt1, align 4
+ %.unpack2.elt12 = getelementptr inbounds nuw i8, ptr %v, i64 16
+ %.unpack2.unpack13 = load float, ptr %.unpack2.elt12, align 4
+ %.unpack2.elt14 = getelementptr inbounds nuw i8, ptr %v, i64 20
+ %.unpack2.unpack15 = load float, ptr %.unpack2.elt14, align 4
+ %.elt3 = getelementptr inbounds nuw i8, ptr %v, i64 24
+ %.unpack4 = load i32, ptr %.elt3, align 8
+ %.elt5 = getelementptr inbounds nuw i8, ptr %v, i64 28
+ %.unpack6 = load i32, ptr %.elt5, align 4
+ store float %.unpack.unpack, ptr %vout, align 16
+ %vout.repack23 = getelementptr inbounds nuw i8, ptr %vout, i64 4
+ store float %.unpack.unpack8, ptr %vout.repack23, align 4
+ %vout.repack25 = getelementptr inbounds nuw i8, ptr %vout, i64 8
+ store float %.unpack.unpack10, ptr %vout.repack25, align 8
+ %vout.repack17 = getelementptr inbounds nuw i8, ptr %vout, i64 12
+ store float %.unpack2.unpack, ptr %vout.repack17, align 4
+ %vout.repack17.repack27 = getelementptr inbounds nuw i8, ptr %vout, i64 16
+ store float %.unpack2.unpack13, ptr %vout.repack17.repack27, align 4
+ %vout.repack17.repack29 = getelementptr inbounds nuw i8, ptr %vout, i64 20
+ store float %.unpack2.unpack15, ptr %vout.repack17.repack29, align 4
+ %vout.repack19 = getelementptr inbounds nuw i8, ptr %vout, i64 24
+ store i32 %.unpack4, ptr %vout.repack19, align 8
+ %vout.repack21 = getelementptr inbounds nuw i8, ptr %vout, i64 28
+ store i32 %.unpack6, ptr %vout.repack21, align 4
+ ret void
+}
+
+; ------------------------------------------------------------------------------
+; Test that alignment is not propagated from a source that does not dominate the destination
+; ------------------------------------------------------------------------------
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define void @no_prop_align(ptr noundef readonly captures(none) %v, ptr noundef writeonly captures(none) initializes((0, 32)) %vout, i1 %cond) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @no_prop_align(
+; CHECK-SAME: ptr noundef readonly captures(none) [[V:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[VOUT:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: br i1 [[COND]], label %[[BRANCH1:.*]], label %[[BRANCH2:.*]]
+; CHECK: [[BRANCH1]]:
+; CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load float, ptr [[V]], align 16
+; CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 4
+; CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load float, ptr [[DOTUNPACK_ELT7]], align 4
+; CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 8
+; CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load float, ptr [[DOTUNPACK_ELT9]], align 8
+; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 12
+; CHECK-NEXT: [[DOTUNPACK2_UNPACK:%.*]] = load float, ptr [[DOTELT1]], align 4
+; CHECK-NEXT: br label %[[END:.*]]
+; CHECK: [[BRANCH2]]:
+; CHECK-NEXT: [[DOTUNPACK2_ELT12:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 16
+; CHECK-NEXT: [[DOTUNPACK2_UNPACK13:%.*]] = load float, ptr [[DOTUNPACK2_ELT12]], align 4
+; CHECK-NEXT: [[DOTUNPACK2_ELT14:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 20
+; CHECK-NEXT: [[DOTUNPACK2_UNPACK15:%.*]] = load float, ptr [[DOTUNPACK2_ELT14]], align 4
+; CHECK-NEXT: [[DOTELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 24
+; CHECK-NEXT: [[DOTUNPACK4:%.*]] = load i32, ptr [[DOTELT3]], align 8
+; CHECK-NEXT: [[DOTELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 28
+; CHECK-NEXT: [[DOTUNPACK6:%.*]] = load i32, ptr [[DOTELT5]], align 4
+; CHECK-NEXT: br label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+ br i1 %cond, label %branch1, label %branch2
+
+branch1:
+ %.unpack.unpack = load float, ptr %v, align 16
+ %.unpack.elt7 = getelementptr inbounds nuw i8, ptr %v, i64 4
+ %.unpack.unpack8 = load float, ptr %.unpack.elt7, align 4
+ %.unpack.elt9 = getelementptr inbounds nuw i8, ptr %v, i64 8
+ %.unpack.unpack10 = load float, ptr %.unpack.elt9, align 8
+ %.elt1 = getelementptr inbounds nuw i8, ptr %v, i64 12
+ %.unpack2.unpack = load float, ptr %.elt1, align 4
+ br label %end
+
+branch2:
+ %.unpack2.elt12 = getelementptr inbounds nuw i8, ptr %v, i64 16
+ %.unpack2.unpack13 = load float, ptr %.unpack2.elt12, align 4
+ %.unpack2.elt14 = getelementptr inbounds nuw i8, ptr %v, i64 20
+ %.unpack2.unpack15 = load float, ptr %.unpack2.elt14, align 4
+ %.elt3 = getelementptr inbounds nuw i8, ptr %v, i64 24
+ %.unpack4 = load i32, ptr %.elt3, align 8
+ %.elt5 = getelementptr inbounds nuw i8, ptr %v, i64 28
+ %.unpack6 = load i32, ptr %.elt5, align 4
+ br label %end
+
+end:
+ ret void
+}
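
[Editor's note] To make the InferAlignment approach above easier to follow, here is a minimal standalone sketch (illustrative only, not the LLVM code) of the per-block bookkeeping in tryToPropagateAlign: each access implies an alignment for its stripped base pointer via commonAlignment(access align, offset from base), the best such base alignment seen so far in the block is remembered per base pointer, and later accesses to the same base are re-derived from it. The Access/propagateInBlock names and plain-integer types are assumptions of the sketch (it also never lowers an existing alignment); the offsets in main() mirror the loads from %v in @prop_align.

#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

static uint64_t commonAlignment(uint64_t A, uint64_t Offset) {
  return Offset == 0 ? A : std::min(A, uint64_t(1) << std::countr_zero(Offset));
}

struct Access {
  int Base;                // stand-in for the stripped base pointer
  uint64_t OffsetFromBase; // constant offset accumulated while stripping GEPs
  uint64_t Alignment;
};

static void propagateInBlock(std::vector<Access> &Block) {
  std::map<int, uint64_t> BestBaseAlign; // base pointer -> best base alignment
  for (Access &A : Block) {
    // Alignment this access implies for its base pointer.
    uint64_t BaseAlign = commonAlignment(A.Alignment, A.OffsetFromBase);
    auto It = BestBaseAlign.find(A.Base);
    if (It != BestBaseAlign.end() && It->second > BaseAlign) {
      // A stronger base alignment is already known in this block; re-derive
      // this access's alignment from it.
      A.Alignment =
          std::max(A.Alignment, commonAlignment(It->second, A.OffsetFromBase));
    } else {
      BestBaseAlign[A.Base] = BaseAlign;
    }
  }
}

int main() {
  // Mirrors the loads from %v in @prop_align: the align-16 load at offset 0
  // lets the offset-16 load (initially align 4) be upgraded to align 16.
  std::vector<Access> Block = {{0, 0, 16}, {0, 4, 4},  {0, 8, 8},  {0, 12, 4},
                               {0, 16, 4}, {0, 20, 4}, {0, 24, 8}, {0, 28, 4}};
  propagateInBlock(Block);
  for (const Access &A : Block)
    std::printf("offset %2llu -> align %llu\n",
                (unsigned long long)A.OffsetFromBase,
                (unsigned long long)A.Alignment);
}
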
>From d696eee1ae7d0db28b930e7cb621038e38e55417 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 22 Jul 2025 16:15:52 +0000
Subject: [PATCH 06/16] Revert original solution
---
.../Vectorize/LoadStoreVectorizer.cpp | 41 +-
.../X86/massive_indirection.ll | 2 +-
.../LoadStoreVectorizer/prop-align.ll | 450 ------------------
3 files changed, 9 insertions(+), 484 deletions(-)
delete mode 100644 llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c9dba72f46e47..89f63c3b66aad 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -343,9 +343,6 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
-
- /// Propagates the best alignment in a chain of contiguous accesses
- void propagateBestAlignmentInChain(ArrayRef<ChainElem> C) const;
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -719,14 +716,6 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
- // We know that the accesses are contiguous. Propagate alignment
- // information so that slices of the chain can still be vectorized.
- propagateBestAlignmentInChain(C);
- LLVM_DEBUG({
- dbgs() << "LSV: Chain after alignment propagation:\n";
- dumpChain(C);
- });
-
std::vector<Chain> Ret;
for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
// Find candidate chains of size not greater than the largest vector reg.
@@ -834,7 +823,6 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
<< Alignment.value() << " to " << NewAlign.value()
<< "\n");
Alignment = NewAlign;
- setLoadStoreAlignment(C[CBegin].Inst, Alignment);
}
}
@@ -892,6 +880,14 @@ bool Vectorizer::vectorizeChain(Chain &C) {
VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
Align Alignment = getLoadStoreAlignment(C[0].Inst);
+ // If this is a load/store of an alloca, we might have upgraded the alloca's
+ // alignment earlier. Get the new alignment.
+ if (AS == DL.getAllocaAddrSpace()) {
+ Alignment = std::max(
+ Alignment,
+ getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
+ MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
+ }
// All elements of the chain must have the same scalar-type size.
#ifndef NDEBUG
@@ -1638,24 +1634,3 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
.sextOrTrunc(OrigBitWidth);
return std::nullopt;
}
-
-void Vectorizer::propagateBestAlignmentInChain(ArrayRef<ChainElem> C) const {
- // Find the element in the chain with the best alignment and its offset.
- Align BestAlign = getLoadStoreAlignment(C[0].Inst);
- APInt BestAlignOffset = C[0].OffsetFromLeader;
- for (const ChainElem &Elem : C) {
- Align ElemAlign = getLoadStoreAlignment(Elem.Inst);
- if (ElemAlign > BestAlign) {
- BestAlign = ElemAlign;
- BestAlignOffset = Elem.OffsetFromLeader;
- }
- }
-
- // Propagate the best alignment to other elements in the chain, if possible.
- for (const ChainElem &Elem : C) {
- APInt OffsetDelta = APIntOps::abdu(Elem.OffsetFromLeader, BestAlignOffset);
- Align NewAlign = commonAlignment(BestAlign, OffsetDelta.getLimitedValue());
- if (NewAlign > getLoadStoreAlignment(Elem.Inst))
- setLoadStoreAlignment(Elem.Inst, NewAlign);
- }
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
index 0931caa1fde8a..fe8a7e58a6a57 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -155,7 +155,7 @@ define void @variadics1(ptr %vlist) {
; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
; CHECK-NEXT: [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
; CHECK-NEXT: [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[X5:%.*]] = fadd double [[X42]], [[X31]]
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
deleted file mode 100644
index aeface5f91abd..0000000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
+++ /dev/null
@@ -1,450 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
-
-; The IR has the first float3 labeled with align 16, and that 16 should
-; be propagated such that the second set of 4 values
-; can also be vectorized together.
-%struct.float3 = type { float, float, float }
-%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
-
-define void @testStore(ptr %1) {
-; CHECK-LABEL: define void @testStore(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: ret void
-;
- store float 0.000000e+00, ptr %1, align 16
- %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
- store float 0.000000e+00, ptr %getElem, align 4
- %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
- store float 0.000000e+00, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
- store float 0.000000e+00, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
- store float 0.000000e+00, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
- store float 0.000000e+00, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
- store i32 0, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
- store i32 0, ptr %getElem13, align 4
- ret void
-}
-
-define void @testLoad(ptr %1) {
-; CHECK-LABEL: define void @testLoad(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
-; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: ret void
-;
- %l1 = load float, ptr %1, align 16
- %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
- %l2 = load float, ptr %getElem, align 4
- %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
- %l3 = load float, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
- %l4 = load float, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
- %l5 = load float, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
- %l6 = load float, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
- %l7 = load i32, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
- %l8 = load i32, ptr %getElem13, align 4
- ret void
-}
-
-; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
-
-define void @testStorei8(ptr %1) {
-; CHECK-LABEL: define void @testStorei8(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: ret void
-;
- store float 0.000000e+00, ptr %1, align 16
- %getElem = getelementptr inbounds i8, ptr %1, i64 4
- store float 0.000000e+00, ptr %getElem, align 4
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
- store float 0.000000e+00, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
- store float 0.000000e+00, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
- store float 0.000000e+00, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
- store float 0.000000e+00, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
- store i32 0, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
- store i32 0, ptr %getElem13, align 4
- ret void
-}
-
-define void @testLoadi8(ptr %1) {
-; CHECK-LABEL: define void @testLoadi8(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
-; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: ret void
-;
- %l1 = load float, ptr %1, align 16
- %getElem = getelementptr inbounds i8, ptr %1, i64 4
- %l2 = load float, ptr %getElem, align 4
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
- %l3 = load float, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
- %l4 = load float, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
- %l5 = load float, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
- %l6 = load float, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
- %l7 = load i32, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
- %l8 = load i32, ptr %getElem13, align 4
- ret void
-}
-
-
-; This version of the test adjusts the struct to hold two i32s at the beginning,
-; but still assumes that the first float3 is 16 aligned. If the alignment
-; propagation works correctly, it should be able to load this struct in three
-; loads: a 2x32, a 4x32, and a 4x32. Without the alignment propagation, the last
-; 4x32 will instead be a 2x32 and a 2x32
-%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
-
-define void @testStore_2(ptr %1) {
-; CHECK-LABEL: define void @testStore_2(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
-; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: ret void
-;
- store i32 0, ptr %1, align 8
- %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
- store i32 0, ptr %getElem, align 4
- %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
- store float 0.000000e+00, ptr %getElem1, align 16
- %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
- store float 0.000000e+00, ptr %getElem2, align 4
- %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
- store float 0.000000e+00, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
- store float 0.000000e+00, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
- store float 0.000000e+00, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
- store float 0.000000e+00, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
- store i32 0, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
- store i32 0, ptr %getElem13, align 4
- ret void
-}
-
-define void @testLoad_2(ptr %1) {
-; CHECK-LABEL: define void @testLoad_2(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
-; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; CHECK-NEXT: ret void
-;
- %l = load i32, ptr %1, align 8
- %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
- %l2 = load i32, ptr %getElem, align 4
- %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
- %l3 = load float, ptr %getElem1, align 16
- %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
- %l4 = load float, ptr %getElem2, align 4
- %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
- %l5 = load float, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
- %l6 = load float, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
- %l7 = load float, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
- %l8 = load float, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
- %l9 = load i32, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
- %l0 = load i32, ptr %getElem13, align 4
- ret void
-}
-
-; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
-
-define void @testStorei8_2(ptr %1) {
-; CHECK-LABEL: define void @testStorei8_2(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: ret void
-;
- store i32 0, ptr %1, align 8
- %getElem = getelementptr inbounds i8, ptr %1, i64 4
- store i32 0, ptr %getElem, align 4
- %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
- store float 0.000000e+00, ptr %getElem1, align 16
- %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
- store float 0.000000e+00, ptr %getElem2, align 4
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
- store float 0.000000e+00, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
- store float 0.000000e+00, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
- store float 0.000000e+00, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
- store float 0.000000e+00, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
- store i32 0, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
- store i32 0, ptr %getElem13, align 4
- ret void
-}
-
-define void @testLoadi8_2(ptr %1) {
-; CHECK-LABEL: define void @testLoadi8_2(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
-; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; CHECK-NEXT: ret void
-;
- %l = load i32, ptr %1, align 8
- %getElem = getelementptr inbounds i8, ptr %1, i64 4
- %l2 = load i32, ptr %getElem, align 4
- %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
- %l3 = load float, ptr %getElem1, align 16
- %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
- %l4 = load float, ptr %getElem2, align 4
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
- %l5 = load float, ptr %getElem8, align 8
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
- %l6 = load float, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
- %l7 = load float, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
- %l8 = load float, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
- %l9 = load i32, ptr %getElem12, align 8
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
- %l0 = load i32, ptr %getElem13, align 4
- ret void
-}
-
-; Test that the alignment propagation works both forwards and backwards.
-; with the "align 16" placed where it is,
-; we should end up with a v2 followed by two v4s followed by a v2.
-define void @test_forward_and_reverse(ptr %1) {
-; CHECK-LABEL: define void @test_forward_and_reverse(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 16
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
-; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
-; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
-; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
-; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
-; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
-; CHECK-NEXT: ret void
-;
- %l = load i32, ptr %1, align 4
- %getElem = getelementptr inbounds i8, ptr %1, i64 4
- %l2 = load i32, ptr %getElem, align 4
- %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
- %l3 = load float, ptr %getElem1, align 4
- %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
- %l4 = load float, ptr %getElem2, align 4
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
- %l5 = load float, ptr %getElem8, align 4
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
- %l6 = load float, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
- %l7 = load float, ptr %getElem10, align 16
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
- %l8 = load float, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
- %l9 = load float, ptr %getElem12, align 4
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
- %l0 = load float, ptr %getElem13, align 4
- %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
- %l11 = load i32, ptr %getElem14, align 4
- %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
- %l12 = load i32, ptr %getElem15, align 4
- ret void
-}
-
-; Test an edge case where the defined alignment is max align
-define void @test_forward_and_reverse_max_align(ptr %1) {
-; CHECK-LABEL: define void @test_forward_and_reverse_max_align(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 4294967296
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
-; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
-; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
-; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
-; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT: ret void
-;
- %l = load i32, ptr %1, align 4
- %getElem = getelementptr inbounds i8, ptr %1, i64 4
- %l2 = load i32, ptr %getElem, align 4
- %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
- %l3 = load float, ptr %getElem1, align 4
- %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
- %l4 = load float, ptr %getElem2, align 4
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
- %l5 = load float, ptr %getElem8, align 4
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
- %l6 = load float, ptr %getElem9, align 4
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
- %l7 = load float, ptr %getElem10, align 4294967296
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
- %l8 = load float, ptr %getElem11, align 4
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
- %l9 = load float, ptr %getElem12, align 4
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
- %l0 = load float, ptr %getElem13, align 4
- %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
- %l11 = load i32, ptr %getElem14, align 4
- %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
- %l12 = load i32, ptr %getElem15, align 4
- ret void
-}
-
-define void @test_i8_elements(ptr %1) {
-; CHECK-LABEL: define void @test_i8_elements(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
-; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GETELEM1]], align 4
-; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
-; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
-; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
-; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
-; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 6
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GETELEM10]], align 4
-; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i8> [[TMP4]], i32 0
-; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i8> [[TMP4]], i32 1
-; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i8> [[TMP4]], i32 2
-; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
-; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr [[GETELEM14]], align 4
-; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
-; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
-; CHECK-NEXT: ret void
-;
- %l = load i8, ptr %1, align 1
- %getElem = getelementptr inbounds i8, ptr %1, i64 1
- %l2 = load i8, ptr %getElem, align 1
- %getElem1 = getelementptr inbounds i8, ptr %1, i64 2
- %l3 = load i8, ptr %getElem1, align 1
- %getElem2 = getelementptr inbounds i8, ptr %1, i64 3
- %l4 = load i8, ptr %getElem2, align 1
- %getElem8 = getelementptr inbounds i8, ptr %1, i64 4
- %l5 = load i8, ptr %getElem8, align 1
- %getElem9 = getelementptr inbounds i8, ptr %1, i64 5
- %l6 = load i8, ptr %getElem9, align 1
- %getElem10 = getelementptr inbounds i8, ptr %1, i64 6
- %l7 = load i8, ptr %getElem10, align 4
- %getElem11 = getelementptr inbounds i8, ptr %1, i64 7
- %l8 = load i8, ptr %getElem11, align 1
- %getElem12 = getelementptr inbounds i8, ptr %1, i64 8
- %l9 = load i8, ptr %getElem12, align 1
- %getElem13 = getelementptr inbounds i8, ptr %1, i64 9
- %l0 = load i8, ptr %getElem13, align 1
- %getElem14 = getelementptr inbounds i8, ptr %1, i64 10
- %l11 = load i8, ptr %getElem14, align 1
- %getElem15 = getelementptr inbounds i8, ptr %1, i64 11
- %l12 = load i8, ptr %getElem15, align 1
- ret void
-}
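As a reading aid for the vectorizeChain() hunk earlier in this patch: the alignment recorded on the chain leader can be stale once the underlying alloca has been over-aligned elsewhere, so the pass re-queries the pointer's known alignment and keeps the stronger of the two. A minimal LLVM-free sketch of that idea; every name below is an illustrative stand-in, not an LLVM API:

// Sketch only: a toy table stands in for the module's alloca alignments and
// chainLeaderAlignToy stands in for the getOrEnforceKnownAlignment() query.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <map>

static std::map<int, uint64_t> AllocaAlignToy = {{0, 16}}; // alloca #0 upgraded to 16

static uint64_t chainLeaderAlignToy(uint64_t RecordedAlign, int AllocaId) {
  auto It = AllocaAlignToy.find(AllocaId);
  uint64_t Known = It == AllocaAlignToy.end() ? 1 : It->second;
  // Keep whichever alignment is stronger, as the hunk does with std::max.
  return std::max(RecordedAlign, Known);
}

int main() {
  // The leader still carries "align 4" from before the alloca was upgraded.
  std::printf("leader align %llu\n",
              (unsigned long long)chainLeaderAlignToy(4, 0));
  return 0;
}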
>From c53e595d8a96ae21508c2177848e15d82a4e4efe Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 22 Jul 2025 16:56:03 +0000
Subject: [PATCH 07/16] Reviewer feedback
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 27 ++++++++++---------
.../propagate-from-other-load-stores.ll | 10 +++----
2 files changed, 18 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 0570ba653c2dc..6785dd86ae64e 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -35,22 +35,23 @@ static bool tryToPropagateAlign(Function &F, const DataLayout &DL) {
for (Instruction &I : BB) {
if (auto *PtrOp = getLoadStorePointerOperand(&I)) {
Align LoadStoreAlign = getLoadStoreAlignment(&I);
- APInt OffsetFromBase = APInt(
- DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace()),
- 0);
- PtrOp = PtrOp->stripAndAccumulateInBoundsConstantOffsets(
- DL, OffsetFromBase);
+ APInt OffsetFromBase =
+ APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true);
Align BasePointerAlign =
commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
- if (BestBasePointerAligns.count(PtrOp) &&
- BestBasePointerAligns[PtrOp] > BasePointerAlign) {
- Align BetterLoadStoreAlign = commonAlignment(
- BestBasePointerAligns[PtrOp], OffsetFromBase.getLimitedValue());
- setLoadStoreAlignment(&I, BetterLoadStoreAlign);
- Changed = true;
- } else {
- BestBasePointerAligns[PtrOp] = BasePointerAlign;
+ auto [It, Inserted] =
+ BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
+ if (!Inserted) {
+ if (It->second > BasePointerAlign) {
+ Align BetterLoadStoreAlign =
+ commonAlignment(It->second, OffsetFromBase.getLimitedValue());
+ setLoadStoreAlignment(&I, BetterLoadStoreAlign);
+ Changed = true;
+ } else {
+ It->second = BasePointerAlign;
+ }
}
}
}
diff --git a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
index 4115d599ccd79..598c2c9e5766c 100644
--- a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
+++ b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
@@ -8,10 +8,9 @@
; Test that we can propagate the align 16 to the load and store that are set to align 4
; ------------------------------------------------------------------------------
-; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define void @prop_align(ptr noundef readonly captures(none) %v, ptr noundef writeonly captures(none) initializes((0, 32)) %vout) local_unnamed_addr #0 {
+define void @prop_align(ptr %v, ptr %vout) {
; CHECK-LABEL: define void @prop_align(
-; CHECK-SAME: ptr noundef readonly captures(none) [[V:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[VOUT:%.*]]) local_unnamed_addr {
+; CHECK-SAME: ptr [[V:%.*]], ptr [[VOUT:%.*]]) {
; CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load float, ptr [[V]], align 16
; CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 4
; CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load float, ptr [[DOTUNPACK_ELT7]], align 4
@@ -81,10 +80,9 @@ define void @prop_align(ptr noundef readonly captures(none) %v, ptr noundef writ
; Test that alignment is not propagated from a source that does not dominate the destination
; ------------------------------------------------------------------------------
-; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define void @no_prop_align(ptr noundef readonly captures(none) %v, ptr noundef writeonly captures(none) initializes((0, 32)) %vout, i1 %cond) local_unnamed_addr #0 {
+define void @no_prop_align(ptr %v, ptr %vout, i1 %cond) {
; CHECK-LABEL: define void @no_prop_align(
-; CHECK-SAME: ptr noundef readonly captures(none) [[V:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[VOUT:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-SAME: ptr [[V:%.*]], ptr [[VOUT:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: br i1 [[COND]], label %[[BRANCH1:.*]], label %[[BRANCH2:.*]]
; CHECK: [[BRANCH1]]:
; CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load float, ptr [[V]], align 16
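For readers following the InferAlignment change above, here is a minimal LLVM-free sketch of the base-pointer propagation it performs. gcdPow2 plays the role of commonAlignment, plain integers stand in for Align/APInt, and offsets are assumed non-negative at this point in the series (negative offsets are handled in a later patch). Illustration only, not the pass itself:

// Sketch under the assumptions stated above; all names are illustrative.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Largest power of two dividing both A and B: the role commonAlignment plays
// for an (alignment, offset) pair.
static uint64_t gcdPow2(uint64_t A, uint64_t B) {
  if (A == 0)
    return B;
  if (B == 0)
    return A;
  uint64_t M = A | B;
  return M & (~M + 1); // lowest set bit
}

struct AccessToy {
  uint64_t Offset; // constant offset from a common base pointer
  uint64_t Align;  // current load/store alignment
};

// Forward pass over accesses off one base pointer: each access implies an
// alignment for the base; a stronger implied base alignment seen earlier can
// then strengthen later accesses.
static void propagateToy(AccessToy *Accs, int N) {
  uint64_t BestBaseAlign = 0;
  for (int I = 0; I < N; ++I) {
    uint64_t FromThis = gcdPow2(Accs[I].Align, Accs[I].Offset);
    if (BestBaseAlign > FromThis)
      Accs[I].Align =
          std::max(Accs[I].Align, gcdPow2(BestBaseAlign, Accs[I].Offset));
    else
      BestBaseAlign = FromThis;
  }
}

int main() {
  // store align 16 at %p, then a load align 4 at %p+24: the load can be
  // raised to align 8, because gcdPow2(16, 24) == 8.
  AccessToy Accs[] = {{0, 16}, {24, 4}};
  propagateToy(Accs, 2);
  std::printf("offset 24 -> align %llu\n", (unsigned long long)Accs[1].Align);
  return 0;
}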
>From 39dc0099dfa49417d3f1045e76ab5e4199909f8e Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 22 Jul 2025 17:39:37 +0000
Subject: [PATCH 08/16] Reviewer feedback
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 76 ++++++++-----------
1 file changed, 33 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 6785dd86ae64e..4cb3eca83ecf3 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -21,44 +21,6 @@
using namespace llvm;
-static bool tryToPropagateAlign(Function &F, const DataLayout &DL) {
- bool Changed = false;
-
- for (BasicBlock &BB : F) {
- // We need to reset the map for each block because alignment information
- // can't be propagated across blocks. This is because control flow could
- // be dependent on the address at runtime, making an alignment assumption
- // within one block not true in another. Some sort of dominator tree
- // approach could be better, but restricting within a basic block is correct
- // too.
- DenseMap<Value *, Align> BestBasePointerAligns;
- for (Instruction &I : BB) {
- if (auto *PtrOp = getLoadStorePointerOperand(&I)) {
- Align LoadStoreAlign = getLoadStoreAlignment(&I);
- APInt OffsetFromBase =
- APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
- PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true);
- Align BasePointerAlign =
- commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
-
- auto [It, Inserted] =
- BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
- if (!Inserted) {
- if (It->second > BasePointerAlign) {
- Align BetterLoadStoreAlign =
- commonAlignment(It->second, OffsetFromBase.getLimitedValue());
- setLoadStoreAlignment(&I, BetterLoadStoreAlign);
- Changed = true;
- } else {
- It->second = BasePointerAlign;
- }
- }
- }
- }
- }
- return Changed;
-}
-
static bool tryToImproveAlign(
const DataLayout &DL, Instruction *I,
function_ref<Align(Value *PtrOp, Align OldAlign, Align PrefAlign)> Fn) {
@@ -95,9 +57,17 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
}
}
- // Compute alignment from known bits.
for (BasicBlock &BB : F) {
+ // We need to reset the map for each block because alignment information
+ // can't be propagated across blocks. This is because control flow could
+ // be dependent on the address at runtime, making an alignment assumption
+ // within one block not true in another. Some sort of dominator tree
+ // approach could be better, but restricting within a basic block is correct
+ // too.
+ DenseMap<Value *, Align> BestBasePointerAligns;
+
for (Instruction &I : BB) {
+ // Compute alignment from known bits.
Changed |= tryToImproveAlign(
DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
@@ -105,13 +75,33 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
+Value::MaxAlignmentExponent);
return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
});
+
+ // Propagate alignment between loads and stores that originate from the
+ // same base pointer
+ Changed |= tryToImproveAlign(
+ DL, &I, [&](Value *PtrOp, Align LoadStoreAlign, Align PrefAlign) {
+ APInt OffsetFromBase =
+ APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase,
+ true);
+ Align BasePointerAlign = commonAlignment(
+ LoadStoreAlign, OffsetFromBase.getLimitedValue());
+
+ auto [It, Inserted] =
+ BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
+ if (!Inserted) {
+ if (It->second > BasePointerAlign) {
+ Align BetterLoadStoreAlign = commonAlignment(
+ It->second, OffsetFromBase.getLimitedValue());
+ return BetterLoadStoreAlign;
+ }
+ It->second = BasePointerAlign;
+ }
+ return LoadStoreAlign;
+ });
}
}
- // Propagate alignment between loads and stores that originate from the same
- // base pointer
- Changed |= tryToPropagateAlign(F, DL);
-
return Changed;
}
>From 2f73c04b9d5bf21b0723a683e7d052002b526a8a Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 22 Jul 2025 17:39:55 +0000
Subject: [PATCH 09/16] Test improvement
---
.../Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll
index 405a26de3d6af..c649f29effeda 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll
@@ -13,7 +13,7 @@ define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) {
; CHECK-NEXT: [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16>
-; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]])
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 8, <1 x i1> [[TMP0]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]])
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64>
>From 3e55c80eeab448fe990d59075f3c5dcc8aa2f42c Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 22 Jul 2025 17:45:33 +0000
Subject: [PATCH 10/16] Add comments
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 4cb3eca83ecf3..9ae2d33c14307 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -77,19 +77,25 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
});
// Propagate alignment between loads and stores that originate from the
- // same base pointer
+ // same base pointer.
Changed |= tryToImproveAlign(
DL, &I, [&](Value *PtrOp, Align LoadStoreAlign, Align PrefAlign) {
APInt OffsetFromBase =
APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase,
true);
+ // Derive the base pointer alignment from the load/store alignment
+ // and the offset from the base pointer.
Align BasePointerAlign = commonAlignment(
LoadStoreAlign, OffsetFromBase.getLimitedValue());
auto [It, Inserted] =
BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
if (!Inserted) {
+ // If the stored base pointer alignment is better than the
+ // base pointer alignment we derived, we may be able to use it
+ // to improve the load/store alignment. If not, store the
+ // improved base pointer alignment for future iterations.
if (It->second > BasePointerAlign) {
Align BetterLoadStoreAlign = commonAlignment(
It->second, OffsetFromBase.getLimitedValue());
>From 1ef400864ef82f515838d26b1f20d3b619cd9c29 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 31 Jul 2025 16:02:17 +0000
Subject: [PATCH 11/16] One call to tryToImproveAlign
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 76 ++++++++++---------
1 file changed, 40 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 9ae2d33c14307..3878d92664c55 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -57,6 +57,43 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
}
}
+ // Compute alignment from known bits.
+ auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) {
+ KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
+ unsigned TrailZ =
+ std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent);
+ return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ };
+
+ // Propagate alignment between loads and stores that originate from the
+ // same base pointer.
+ DenseMap<Value *, Align> BestBasePointerAligns;
+ auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) {
+ APInt OffsetFromBase =
+ APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true);
+ // Derive the base pointer alignment from the load/store alignment
+ // and the offset from the base pointer.
+ Align BasePointerAlign =
+ commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
+
+ auto [It, Inserted] =
+ BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
+ if (!Inserted) {
+ // If the stored base pointer alignment is better than the
+ // base pointer alignment we derived, we may be able to use it
+ // to improve the load/store alignment. If not, store the
+ // improved base pointer alignment for future iterations.
+ if (It->second > BasePointerAlign) {
+ Align BetterLoadStoreAlign =
+ commonAlignment(It->second, OffsetFromBase.getLimitedValue());
+ return BetterLoadStoreAlign;
+ }
+ It->second = BasePointerAlign;
+ }
+ return LoadStoreAlign;
+ };
+
for (BasicBlock &BB : F) {
// We need to reset the map for each block because alignment information
// can't be propagated across blocks. This is because control flow could
@@ -64,46 +101,13 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
// within one block not true in another. Some sort of dominator tree
// approach could be better, but restricting within a basic block is correct
// too.
- DenseMap<Value *, Align> BestBasePointerAligns;
+ BestBasePointerAligns.clear();
for (Instruction &I : BB) {
- // Compute alignment from known bits.
Changed |= tryToImproveAlign(
DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
- KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
- unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
- +Value::MaxAlignmentExponent);
- return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
- });
-
- // Propagate alignment between loads and stores that originate from the
- // same base pointer.
- Changed |= tryToImproveAlign(
- DL, &I, [&](Value *PtrOp, Align LoadStoreAlign, Align PrefAlign) {
- APInt OffsetFromBase =
- APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
- PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase,
- true);
- // Derive the base pointer alignment from the load/store alignment
- // and the offset from the base pointer.
- Align BasePointerAlign = commonAlignment(
- LoadStoreAlign, OffsetFromBase.getLimitedValue());
-
- auto [It, Inserted] =
- BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
- if (!Inserted) {
- // If the stored base pointer alignment is better than the
- // base pointer alignment we derived, we may be able to use it
- // to improve the load/store alignment. If not, store the
- // improved base pointer alignment for future iterations.
- if (It->second > BasePointerAlign) {
- Align BetterLoadStoreAlign = commonAlignment(
- It->second, OffsetFromBase.getLimitedValue());
- return BetterLoadStoreAlign;
- }
- It->second = BasePointerAlign;
- }
- return LoadStoreAlign;
+ return std::max(InferFromKnownBits(I, PtrOp),
+ InferFromBasePointer(PtrOp, OldAlign));
});
}
}
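Condensed shape of what patch 11 converges on: one improvement driver per instruction, fed by the maximum of the two inference sources. The tryToImproveAlign() body is not part of this excerpt, so the toy driver below is an assumption about its contract (apply the returned alignment only when it is strictly better), and the *Toy names are stand-ins rather than LLVM APIs:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>

struct InstToy {
  uint64_t Align; // current load/store alignment, a power of two
};

// Assumed contract of the real driver: run the callback and keep its result
// only if it beats the current alignment.
static bool tryToImproveAlignToy(InstToy &I,
                                 const std::function<uint64_t(uint64_t)> &Fn) {
  uint64_t NewAlign = Fn(I.Align);
  if (NewAlign > I.Align) {
    I.Align = NewAlign;
    return true;
  }
  return false;
}

int main() {
  InstToy I{4};
  // Two independent inference sources, combined exactly once per instruction:
  // whichever produces the larger alignment wins.
  auto InferFromKnownBitsToy = [](uint64_t Old) { return std::max<uint64_t>(Old, 8); };
  auto InferFromBasePointerToy = [](uint64_t Old) { return std::max<uint64_t>(Old, 16); };
  bool Changed = tryToImproveAlignToy(I, [&](uint64_t Old) {
    return std::max(InferFromKnownBitsToy(Old), InferFromBasePointerToy(Old));
  });
  std::printf("changed=%d align=%llu\n", Changed, (unsigned long long)I.Align);
  return 0;
}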
>From 622cf4880f441f7da6cf0b715a0afb918bcbfb8e Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dakersnar at me.com>
Date: Fri, 1 Aug 2025 09:32:30 -0500
Subject: [PATCH 12/16] Update InferAlignment.cpp
Co-authored-by: Nikita Popov <github at npopov.com>
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 3878d92664c55..30ee21d49475f 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -69,8 +69,7 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
// same base pointer.
DenseMap<Value *, Align> BestBasePointerAligns;
auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) {
- APInt OffsetFromBase =
- APInt(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true);
// Derive the base pointer alignment from the load/store alignment
// and the offset from the base pointer.
>From 5d15526d043285239e8daf450c72d9ce154c6ae0 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 4 Aug 2025 17:47:13 +0000
Subject: [PATCH 13/16] Fix clang tests (all improvements), add negative gep
tests
---
.../CodeGen/attr-counted-by-for-pointers.c | 8 ++---
clang/test/OpenMP/bug57757.cpp | 2 +-
.../propagate-from-other-load-stores.ll | 32 +++++++++++++++++++
3 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/clang/test/CodeGen/attr-counted-by-for-pointers.c b/clang/test/CodeGen/attr-counted-by-for-pointers.c
index e939e49a61d4d..0d72b58c78fd1 100644
--- a/clang/test/CodeGen/attr-counted-by-for-pointers.c
+++ b/clang/test/CodeGen/attr-counted-by-for-pointers.c
@@ -32,7 +32,7 @@ struct annotated_ptr {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
-// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]]
@@ -85,7 +85,7 @@ void test1(struct annotated_ptr *p, int index, struct foo *value) {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
-// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
@@ -138,7 +138,7 @@ void test2(struct annotated_ptr *p, int index, struct foo *value) {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
-// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT10:%.*]], !prof [[PROF15:![0-9]+]], !nosanitize [[META2]]
@@ -311,7 +311,7 @@ size_t test6(struct annotated_ptr *p, int index) {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
-// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp
index eabf233dde247..caf53a5b62c1c 100644
--- a/clang/test/OpenMP/bug57757.cpp
+++ b/clang/test/OpenMP/bug57757.cpp
@@ -46,7 +46,7 @@ void foo() {
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48
// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias [[META13]]
-// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA16]], !noalias [[META13]]
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 8, !tbaa [[TBAA16]], !noalias [[META13]]
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias [[META13]]
// CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias [[META13]]
// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
diff --git a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
index 598c2c9e5766c..f2d8253343c08 100644
--- a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
+++ b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
@@ -132,3 +132,35 @@ branch2:
end:
ret void
}
+
+; ------------------------------------------------------------------------------
+; Test that we can propagate to/from negative offset GEPs
+; ------------------------------------------------------------------------------
+
+define void @prop_align_negative_offset(ptr %v) {
+; CHECK-LABEL: define void @prop_align_negative_offset(
+; CHECK-SAME: ptr [[V:%.*]]) {
+; CHECK-NEXT: [[LOADALIGNED:%.*]] = load float, ptr [[V]], align 16
+; CHECK-NEXT: [[GEPNEGATIVE:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 -16
+; CHECK-NEXT: [[LOADUNALIGNED:%.*]] = load float, ptr [[GEPNEGATIVE]], align 16
+; CHECK-NEXT: ret void
+;
+ %loadAligned= load float, ptr %v, align 16
+ %gepNegative = getelementptr inbounds nuw i8, ptr %v, i64 -16
+ %loadUnaligned = load float, ptr %gepNegative, align 4
+ ret void
+}
+
+define void @prop_align_negative_offset_2(ptr %v) {
+; CHECK-LABEL: define void @prop_align_negative_offset_2(
+; CHECK-SAME: ptr [[V:%.*]]) {
+; CHECK-NEXT: [[GEPNEGATIVE:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 -16
+; CHECK-NEXT: [[LOADALIGNED:%.*]] = load float, ptr [[GEPNEGATIVE]], align 16
+; CHECK-NEXT: [[LOADUNALIGNED:%.*]] = load float, ptr [[V]], align 16
+; CHECK-NEXT: ret void
+;
+ %gepNegative = getelementptr inbounds nuw i8, ptr %v, i64 -16
+ %loadAligned = load float, ptr %gepNegative, align 16
+ %loadUnaligned= load float, ptr %v, align 4
+ ret void
+}
>From 8d0d3fc69614c925242f48d521a0d6d0cd02e1f2 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 5 Aug 2025 16:08:22 +0000
Subject: [PATCH 14/16] Fix negative offsets calculation
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 30ee21d49475f..1396272ce2006 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -74,7 +74,7 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
// Derive the base pointer alignment from the load/store alignment
// and the offset from the base pointer.
Align BasePointerAlign =
- commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
+ commonAlignment(LoadStoreAlign, OffsetFromBase.abs().getLimitedValue());
auto [It, Inserted] =
BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
@@ -85,7 +85,7 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
// improved base pointer alignment for future iterations.
if (It->second > BasePointerAlign) {
Align BetterLoadStoreAlign =
- commonAlignment(It->second, OffsetFromBase.getLimitedValue());
+ commonAlignment(It->second, OffsetFromBase.abs().getLimitedValue());
return BetterLoadStoreAlign;
}
It->second = BasePointerAlign;
>From c6eb67aab6c1e3472794bf5994c2c977868ccf20 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 6 Aug 2025 20:11:09 +0000
Subject: [PATCH 15/16] Remove abs, add a couple more tests
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 4 +--
.../propagate-from-other-load-stores.ll | 28 +++++++++++++++++++
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 1396272ce2006..30ee21d49475f 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -74,7 +74,7 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
// Derive the base pointer alignment from the load/store alignment
// and the offset from the base pointer.
Align BasePointerAlign =
- commonAlignment(LoadStoreAlign, OffsetFromBase.abs().getLimitedValue());
+ commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
auto [It, Inserted] =
BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
@@ -85,7 +85,7 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
// improved base pointer alignment for future iterations.
if (It->second > BasePointerAlign) {
Align BetterLoadStoreAlign =
- commonAlignment(It->second, OffsetFromBase.abs().getLimitedValue());
+ commonAlignment(It->second, OffsetFromBase.getLimitedValue());
return BetterLoadStoreAlign;
}
It->second = BasePointerAlign;
diff --git a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
index f2d8253343c08..3fc7c59a512a5 100644
--- a/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
+++ b/llvm/test/Transforms/InferAlignment/propagate-from-other-load-stores.ll
@@ -164,3 +164,31 @@ define void @prop_align_negative_offset_2(ptr %v) {
%loadUnaligned= load float, ptr %v, align 4
ret void
}
+
+define void @prop_align_negative_offset_3(ptr %v) {
+; CHECK-LABEL: define void @prop_align_negative_offset_3(
+; CHECK-SAME: ptr [[V:%.*]]) {
+; CHECK-NEXT: [[LOADALIGNED:%.*]] = load float, ptr [[V]], align 16
+; CHECK-NEXT: [[GEPNEGATIVE:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 -8
+; CHECK-NEXT: [[LOADUNALIGNED:%.*]] = load float, ptr [[GEPNEGATIVE]], align 8
+; CHECK-NEXT: ret void
+;
+ %loadAligned= load float, ptr %v, align 16
+ %gepNegative = getelementptr inbounds nuw i8, ptr %v, i64 -8
+ %loadUnaligned = load float, ptr %gepNegative, align 4
+ ret void
+}
+
+define void @prop_align_negative_offset_4(ptr %v) {
+; CHECK-LABEL: define void @prop_align_negative_offset_4(
+; CHECK-SAME: ptr [[V:%.*]]) {
+; CHECK-NEXT: [[LOADALIGNED:%.*]] = load float, ptr [[V]], align 16
+; CHECK-NEXT: [[GEPNEGATIVE:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 -20
+; CHECK-NEXT: [[LOADUNALIGNED:%.*]] = load float, ptr [[GEPNEGATIVE]], align 4
+; CHECK-NEXT: ret void
+;
+ %loadAligned= load float, ptr %v, align 16
+ %gepNegative = getelementptr inbounds nuw i8, ptr %v, i64 -20
+ %loadUnaligned = load float, ptr %gepNegative, align 4
+ ret void
+}
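On why this patch can drop the abs() that the previous one introduced: the alignment implied by an (alignment, offset) pair depends only on the largest power of two dividing the offset, and in two's complement a value and its negation have the same trailing-zero count, so commonAlignment over the raw unsigned offset already gives the right answer for negative GEPs. A toy check of that reasoning, using plain uint64_t instead of llvm::APInt:

#include <cstdint>
#include <cstdio>

// Largest power of two dividing X (its lowest set bit); this is the quantity
// commonAlignment can preserve for an offset of X. Zero gets an all-ones
// sentinel, since every power of two divides 0.
static uint64_t maxPow2Divisor(uint64_t X) {
  return X == 0 ? ~uint64_t(0) : (X & (~X + 1));
}

int main() {
  // Mirrors the prop_align_negative_offset_3/4 tests above: from a 16-aligned
  // base, an access at offset -8 can be raised to align 8, while an access at
  // offset -20 stays at align 4.
  long long Offsets[] = {-8, -20, 8, 20};
  for (long long Off : Offsets)
    std::printf("offset %lld -> largest power-of-two divisor %llu\n", Off,
                (unsigned long long)maxPow2Divisor((uint64_t)Off));
  return 0;
}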
>From 215d65f7c6a9798afe5dba5f46a307063c69d034 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 7 Aug 2025 15:36:17 +0000
Subject: [PATCH 16/16] Adjust comment to more correctly clarify the
limitations of this optimization
---
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 30ee21d49475f..e9bf59c6850a3 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -95,11 +95,11 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
for (BasicBlock &BB : F) {
// We need to reset the map for each block because alignment information
- // can't be propagated across blocks. This is because control flow could
- // be dependent on the address at runtime, making an alignment assumption
- // within one block not true in another. Some sort of dominator tree
- // approach could be better, but restricting within a basic block is correct
- // too.
+ // can only be propagated from instruction A to B if A dominates B.
+ // This is because control flow (and exception throwing) could be dependent
+ // on the address (and its alignment) at runtime. Some sort of dominator
+ // tree approach could be better, but doing a simple forward pass through a
+ // single basic block is correct too.
BestBasePointerAligns.clear();
for (Instruction &I : BB) {