[llvm] [LoadStoreVectorizer] Propagate alignment through contiguous chain (PR #145733)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 25 10:57:30 PDT 2025
https://github.com/dakersnar updated https://github.com/llvm/llvm-project/pull/145733
>From 15600f92b838448691eb3d27fbd97331fcdc7bb2 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 25 Jun 2025 15:58:20 +0000
Subject: [PATCH 1/2] [LoadStoreVectorizer] Propagate alignment through
contiguous chain to improve vectorization
---
.../Vectorize/LoadStoreVectorizer.cpp | 35 +++
.../LoadStoreVectorizer/prop-align.ll | 296 ++++++++++++++++++
2 files changed, 331 insertions(+)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89f63c3b66aad..e14a936b764e5 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -343,6 +343,9 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ /// Propagates the best alignment in a chain of contiguous accesses
+ void propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const;
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -716,6 +719,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
+ // We know that the accesses are contiguous. Propagate alignment
+ // information so that slices of the chain can still be vectorized.
+ propagateBestAlignmentsInChain(C);
+ LLVM_DEBUG({
+ dbgs() << "LSV: Chain after alignment propagation:\n";
+ dumpChain(C);
+ });
+
std::vector<Chain> Ret;
for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
// Find candidate chains of size not greater than the largest vector reg.
@@ -1634,3 +1645,27 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
.sextOrTrunc(OrigBitWidth);
return std::nullopt;
}
+
+void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
+ ChainElem BestAlignedElem = C[0];
+ Align BestAlignSoFar = getLoadStoreAlignment(C[0].Inst);
+
+ for (const ChainElem &E : C) {
+ Align OrigAlign = getLoadStoreAlignment(E.Inst);
+ if (OrigAlign > BestAlignSoFar) {
+ BestAlignedElem = E;
+ BestAlignSoFar = OrigAlign;
+ }
+
+ APInt OffsetFromBestAlignedElem =
+ E.OffsetFromLeader - BestAlignedElem.OffsetFromLeader;
+ assert(OffsetFromBestAlignedElem.isNonNegative());
+ // commonAlignment is equivalent to a greatest common power-of-two divisor;
+ // it returns the largest power of 2 that divides both A and B.
+ Align NewAlign = commonAlignment(
+ BestAlignSoFar, OffsetFromBestAlignedElem.getLimitedValue());
+ if (NewAlign > OrigAlign)
+ setLoadStoreAlignment(E.Inst, NewAlign);
+ }
+ return;
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
new file mode 100644
index 0000000000000..a1878dc051d99
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; The IR has the first float3 labeled with align 16, and that 16 should
+; be propagated such that the second set of 4 values
+; can also be vectorized together.
+%struct.float3 = type { float, float, float }
+%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
+
+define void @testStore(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStore(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store float 0.000000e+00, ptr %1, align 16
+ %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
+ store float 0.000000e+00, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoad(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoad(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l1 = load float, ptr %1, align 16
+ %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
+ %l2 = load float, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
+ %l3 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
+ %l4 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
+ %l5 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
+ %l6 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
+ %l7 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
+ %l8 = load i32, ptr %getElem13, align 4
+ ret void
+}
+
+; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
+
+define void @testStorei8(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStorei8(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store float 0.000000e+00, ptr %1, align 16
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ store float 0.000000e+00, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoadi8(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoadi8(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l1 = load float, ptr %1, align 16
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load float, ptr %getElem, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load i32, ptr %getElem13, align 4
+ ret void
+}
+
+
+; This version of the test adjusts the struct to hold two i32s at the beginning,
+; but still assumes that the first float3 is 16-aligned. If the alignment
+; propagation works correctly, it should be able to load this struct in three
+; loads: a 2x32, a 4x32, and a 4x32. Without the alignment propagation, the last
+; 4x32 will instead be a 2x32 and a 2x32.
+%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
+
+define void @testStore_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStore_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store i32 0, ptr %1, align 8
+ %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
+ store i32 0, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
+ store float 0.000000e+00, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
+ store float 0.000000e+00, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoad_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoad_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 8
+ %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
+ %l3 = load float, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
+ %l5 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
+ %l7 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
+ %l9 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
+ %l0 = load i32, ptr %getElem13, align 4
+ ret void
+}
+
+; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
+
+define void @testStorei8_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testStorei8_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+ store i32 0, ptr %1, align 8
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ store i32 0, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ store float 0.000000e+00, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ store float 0.000000e+00, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ store float 0.000000e+00, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ store float 0.000000e+00, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ store float 0.000000e+00, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ store float 0.000000e+00, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ store i32 0, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ store i32 0, ptr %getElem13, align 4
+ ret void
+}
+
+define void @testLoadi8_2(ptr nocapture writeonly %1) {
+; CHECK-LABEL: define void @testLoadi8_2(
+; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 8
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem1, align 16
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem8, align 8
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load float, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ %l9 = load i32, ptr %getElem12, align 8
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ %l0 = load i32, ptr %getElem13, align 4
+ ret void
+}
>From b905f1c05ff6e0e531bf5cb65fe067635673e76c Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 25 Jun 2025 17:48:57 +0000
Subject: [PATCH 2/2] Address feedback, add reverse propagation, simplify and
expand unit tests
---
.../Vectorize/LoadStoreVectorizer.cpp | 43 ++--
.../LoadStoreVectorizer/prop-align.ll | 186 ++++++++++++++++--
2 files changed, 194 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index e14a936b764e5..95ec574be7d2c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1647,25 +1647,30 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
}
void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
- ChainElem BestAlignedElem = C[0];
- Align BestAlignSoFar = getLoadStoreAlignment(C[0].Inst);
-
- for (const ChainElem &E : C) {
- Align OrigAlign = getLoadStoreAlignment(E.Inst);
- if (OrigAlign > BestAlignSoFar) {
- BestAlignedElem = E;
- BestAlignSoFar = OrigAlign;
+ auto PropagateAlignments = [](auto ChainIt) {
+ ChainElem BestAlignedElem = *ChainIt.begin();
+ Align BestAlignSoFar = getLoadStoreAlignment(BestAlignedElem.Inst);
+
+ for (const ChainElem &E : ChainIt) {
+ Align OrigAlign = getLoadStoreAlignment(E.Inst);
+ if (OrigAlign > BestAlignSoFar) {
+ BestAlignedElem = E;
+ BestAlignSoFar = OrigAlign;
+ continue;
+ }
+
+ APInt DeltaFromBestAlignedElem =
+ APIntOps::abdu(E.OffsetFromLeader, BestAlignedElem.OffsetFromLeader);
+ // commonAlignment is equivalent to a greatest common power-of-two
+ // divisor; it returns the largest power of 2 that divides both A and B.
+ Align NewAlign = commonAlignment(
+ BestAlignSoFar, DeltaFromBestAlignedElem.getLimitedValue());
+ if (NewAlign > OrigAlign)
+ setLoadStoreAlignment(E.Inst, NewAlign);
}
+ };
- APInt OffsetFromBestAlignedElem =
- E.OffsetFromLeader - BestAlignedElem.OffsetFromLeader;
- assert(OffsetFromBestAlignedElem.isNonNegative());
- // commonAlignment is equivalent to a greatest common power-of-two divisor;
- // it returns the largest power of 2 that divides both A and B.
- Align NewAlign = commonAlignment(
- BestAlignSoFar, OffsetFromBestAlignedElem.getLimitedValue());
- if (NewAlign > OrigAlign)
- setLoadStoreAlignment(E.Inst, NewAlign);
- }
- return;
+ // Propagate forwards and backwards.
+ PropagateAlignments(C);
+ PropagateAlignments(reverse(C));
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
index a1878dc051d99..aeface5f91abd 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
@@ -7,9 +7,9 @@
%struct.float3 = type { float, float, float }
%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
-define void @testStore(ptr nocapture writeonly %1) {
+define void @testStore(ptr %1) {
; CHECK-LABEL: define void @testStore(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
@@ -33,9 +33,9 @@ define void @testStore(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoad(ptr nocapture writeonly %1) {
+define void @testLoad(ptr %1) {
; CHECK-LABEL: define void @testLoad(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -71,9 +71,9 @@ define void @testLoad(ptr nocapture writeonly %1) {
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
-define void @testStorei8(ptr nocapture writeonly %1) {
+define void @testStorei8(ptr %1) {
; CHECK-LABEL: define void @testStorei8(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
@@ -97,9 +97,9 @@ define void @testStorei8(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoadi8(ptr nocapture writeonly %1) {
+define void @testLoadi8(ptr %1) {
; CHECK-LABEL: define void @testLoadi8(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -141,9 +141,9 @@ define void @testLoadi8(ptr nocapture writeonly %1) {
; 4x32 will instead be a 2x32 and a 2x32
%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
-define void @testStore_2(ptr nocapture writeonly %1) {
+define void @testStore_2(ptr %1) {
; CHECK-LABEL: define void @testStore_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
@@ -173,9 +173,9 @@ define void @testStore_2(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoad_2(ptr nocapture writeonly %1) {
+define void @testLoad_2(ptr %1) {
; CHECK-LABEL: define void @testLoad_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
@@ -219,9 +219,9 @@ define void @testLoad_2(ptr nocapture writeonly %1) {
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
-define void @testStorei8_2(ptr nocapture writeonly %1) {
+define void @testStorei8_2(ptr %1) {
; CHECK-LABEL: define void @testStorei8_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
@@ -251,9 +251,9 @@ define void @testStorei8_2(ptr nocapture writeonly %1) {
ret void
}
-define void @testLoadi8_2(ptr nocapture writeonly %1) {
+define void @testLoadi8_2(ptr %1) {
; CHECK-LABEL: define void @testLoadi8_2(
-; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
@@ -294,3 +294,157 @@ define void @testLoadi8_2(ptr nocapture writeonly %1) {
%l0 = load i32, ptr %getElem13, align 4
ret void
}
+
+; Test that the alignment propagation works both forwards and backwards:
+; with the "align 16" placed where it is,
+; we should end up with a v2 followed by two v4s followed by a v2.
+define void @test_forward_and_reverse(ptr %1) {
+; CHECK-LABEL: define void @test_forward_and_reverse(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 4
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem8, align 4
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load float, ptr %getElem10, align 16
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ %l9 = load float, ptr %getElem12, align 4
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ %l0 = load float, ptr %getElem13, align 4
+ %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
+ %l11 = load i32, ptr %getElem14, align 4
+ %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
+ %l12 = load i32, ptr %getElem15, align 4
+ ret void
+}
+
+; Test an edge case where the defined alignment is max align
+define void @test_forward_and_reverse_max_align(ptr %1) {
+; CHECK-LABEL: define void @test_forward_and_reverse_max_align(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 4294967296
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: ret void
+;
+ %l = load i32, ptr %1, align 4
+ %getElem = getelementptr inbounds i8, ptr %1, i64 4
+ %l2 = load i32, ptr %getElem, align 4
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+ %l3 = load float, ptr %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+ %l4 = load float, ptr %getElem2, align 4
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+ %l5 = load float, ptr %getElem8, align 4
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+ %l6 = load float, ptr %getElem9, align 4
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+ %l7 = load float, ptr %getElem10, align 4294967296
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+ %l8 = load float, ptr %getElem11, align 4
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+ %l9 = load float, ptr %getElem12, align 4
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+ %l0 = load float, ptr %getElem13, align 4
+ %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
+ %l11 = load i32, ptr %getElem14, align 4
+ %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
+ %l12 = load i32, ptr %getElem15, align 4
+ ret void
+}
+
+define void @test_i8_elements(ptr %1) {
+; CHECK-LABEL: define void @test_i8_elements(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GETELEM1]], align 4
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 6
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GETELEM10]], align 4
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i8> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i8> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i8> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr [[GETELEM14]], align 4
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
+; CHECK-NEXT: ret void
+;
+ %l = load i8, ptr %1, align 1
+ %getElem = getelementptr inbounds i8, ptr %1, i64 1
+ %l2 = load i8, ptr %getElem, align 1
+ %getElem1 = getelementptr inbounds i8, ptr %1, i64 2
+ %l3 = load i8, ptr %getElem1, align 1
+ %getElem2 = getelementptr inbounds i8, ptr %1, i64 3
+ %l4 = load i8, ptr %getElem2, align 1
+ %getElem8 = getelementptr inbounds i8, ptr %1, i64 4
+ %l5 = load i8, ptr %getElem8, align 1
+ %getElem9 = getelementptr inbounds i8, ptr %1, i64 5
+ %l6 = load i8, ptr %getElem9, align 1
+ %getElem10 = getelementptr inbounds i8, ptr %1, i64 6
+ %l7 = load i8, ptr %getElem10, align 4
+ %getElem11 = getelementptr inbounds i8, ptr %1, i64 7
+ %l8 = load i8, ptr %getElem11, align 1
+ %getElem12 = getelementptr inbounds i8, ptr %1, i64 8
+ %l9 = load i8, ptr %getElem12, align 1
+ %getElem13 = getelementptr inbounds i8, ptr %1, i64 9
+ %l0 = load i8, ptr %getElem13, align 1
+ %getElem14 = getelementptr inbounds i8, ptr %1, i64 10
+ %l11 = load i8, ptr %getElem14, align 1
+ %getElem15 = getelementptr inbounds i8, ptr %1, i64 11
+ %l12 = load i8, ptr %getElem15, align 1
+ ret void
+}
More information about the llvm-commits
mailing list