[llvm] [LowerBufferFatPointers] Don't lose data from unaligned < word vectors (PR #132981)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 26 13:14:02 PDT 2025
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/132981
>From 86a963489af6531bce2c3b6506ddd772b496ecd8 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 25 Mar 2025 19:25:16 +0000
Subject: [PATCH 1/2] [LowerBufferFatPointers] Don't lose data from unaligned <
word vectors
Previously, AMDGPULowerBufferFatPointers would always lower loads and stores
of long vectors of small types, such as <8 x i8> or <2 x half>, to
instructions that operate on words — loading/storing <2 x i32> and i32,
respectively.
This transformation is correct - and correctly returns 0s for loads
where the vector is partially in bounds - only if the offset of the
vector is word-aligned. That is, supposing a buffer has a numRecords
of 8, loading an <8 x i8> starting at offset 4 will correctly return
the last word of the buffer and 4 0s. However, if one instead starts
at offset 6 or 7 (for an alignment of 2 and 1, respectively), the
hardware will mask off the entire word, causing an all-0 result.
To inhibit this surprising and undesired behavior, loads/stores of
vectors with sub-word elements that aren't aligned to at least a word
will be broken down into scalar reads and writes, preserving the
expected out-of-bounds behavior.
This transformation will still load at least one element at a time, so
a <4 x half>, align 1 load will still use buffer_load_ushort.
---
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 47 ++--
...ffer-fat-pointers-contents-legalization.ll | 204 +++++++++++++++++-
...ffer-fat-pointers-contents-legalization.ll | 116 +++++++++-
3 files changed, 347 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 5dd1fe14e5626..ee23a14960767 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -697,8 +697,9 @@ class LegalizeBufferContentTypesVisitor
/// Convert a vector or scalar type that can't be operated on by buffer
/// intrinsics to one that would be legal through bitcasts and/or truncation.
- /// Uses the wider of i32, i16, or i8 where possible.
- Type *legalNonAggregateFor(Type *T);
+ /// Uses the wider of i32, i16, or i8 where possible, accounting for the
+ /// alignment of the load or store.
+ Type *legalNonAggregateFor(Type *T, Align A);
Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
@@ -712,8 +713,10 @@ class LegalizeBufferContentTypesVisitor
/// Return the [index, length] pairs into which `T` needs to be cut to form
/// legal buffer load or store operations. Clears `Slices`. Creates an empty
/// `Slices` for non-vector inputs and creates one slice if no slicing will be
- /// needed.
- void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
+ /// needed. If `T` is a vector of sub-word type (i8, half, etc.) and `align`
+ /// is less than 4, splits the load into scalar loads so that reading off the
+ /// end of a byte buffer doesn't lose data.
+ void getVecSlices(Type *T, Align A, SmallVectorImpl<VecSlice> &Slices);
Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
@@ -790,7 +793,8 @@ Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
return ArrayRes;
}
-Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T,
+ Align A) {
TypeSize Size = DL.getTypeStoreSizeInBits(T);
// Implicitly zero-extend to the next byte if needed
if (!DL.typeSizeEqualsStoreSize(T))
@@ -802,15 +806,18 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
return T;
}
unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
- if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+ bool IsUnaligned16BitVector = ElemSize == 16 && Size > ElemSize && A < 4;
+ if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128 &&
+ !IsUnaligned16BitVector) {
// [vectors of] anything that's 16/32/64/128 bits can be cast and split into
- // legal buffer operations.
+ // legal buffer operations, except that unaligned 16-bit vectors need to be
+ // split.
return T;
}
Type *BestVectorElemType = nullptr;
- if (Size.isKnownMultipleOf(32))
+ if (Size.isKnownMultipleOf(32) && A >= Align(4))
BestVectorElemType = IRB.getInt32Ty();
- else if (Size.isKnownMultipleOf(16))
+ else if (Size.isKnownMultipleOf(16) && A >= Align(2))
BestVectorElemType = IRB.getInt16Ty();
else
BestVectorElemType = IRB.getInt8Ty();
@@ -883,7 +890,7 @@ Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
}
void LegalizeBufferContentTypesVisitor::getVecSlices(
- Type *T, SmallVectorImpl<VecSlice> &Slices) {
+ Type *T, Align A, SmallVectorImpl<VecSlice> &Slices) {
Slices.clear();
auto *VT = dyn_cast<FixedVectorType>(T);
if (!VT)
@@ -902,6 +909,16 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
// example, <3 x i64>, since that's not slicing.
uint64_t ElemsPer3Words = ElemsPerWord * 3;
+ if (ElemBitWidth < 32 && A < Align(4)) {
+ // Don't use wide loads when loading unaligned vectors of 16- or 8-bit
+ // types, as that can cause something like a load of <4 x half>
+ // from %base + 6 with numRecords = 8 bytes to not load the last element
+ // as one might expect.
+ ElemsPer4Words = ElemsPer3Words = ElemsPer2Words = ElemsPerWord = 0;
+ if (ElemBitWidth < 16 && A < Align(2)) {
+ ElemsPerShort = 0;
+ }
+ }
uint64_t TotalElems = VT->getNumElements();
uint64_t Index = 0;
auto TrySlice = [&](unsigned MaybeLen) {
@@ -1003,11 +1020,12 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
// Typical case
+ Align PartAlign = commonAlignment(OrigLI.getAlign(), AggByteOff);
Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ Type *LegalType = legalNonAggregateFor(ArrayAsVecType, PartAlign);
SmallVector<VecSlice> Slices;
- getVecSlices(LegalType, Slices);
+ getVecSlices(LegalType, PartAlign, Slices);
bool HasSlices = Slices.size() > 1;
bool IsAggPart = !AggIdxs.empty();
Value *LoadsRes;
@@ -1133,13 +1151,14 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
NewData = arrayToVector(NewData, ArrayAsVecType, Name);
}
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ Align PartAlign = commonAlignment(OrigSI.getAlign(), AggByteOff);
+ Type *LegalType = legalNonAggregateFor(ArrayAsVecType, PartAlign);
if (LegalType != ArrayAsVecType) {
NewData = makeLegalNonAggregate(NewData, LegalType, Name);
}
SmallVector<VecSlice> Slices;
- getVecSlices(LegalType, Slices);
+ getVecSlices(LegalType, PartAlign, Slices);
bool NeedToSplit = Slices.size() > 1 || IsAggPart;
if (!NeedToSplit) {
Type *StorableType = intrinsicTypeFor(LegalType);
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 405058b24dcc2..37eb82a2efef9 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -274,6 +274,44 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <2 x i32> @load_v2i32_align1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i32_align1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i32_align1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i32>, ptr addrspace(7) %p, align 1
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32_align1(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i32_align1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i32_align1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i32> %data, ptr addrspace(7) %p, align 1
+ ret void
+}
+
define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v3i32:
; SDAG: ; %bb.0:
@@ -616,6 +654,56 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <2 x half> @load_v2f16_align2(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f16_align2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f16_align2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x half>, ptr addrspace(7) %p, align 2
+ ret <2 x half> %ret
+}
+
+define void @store_v2f16_align2(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f16_align2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f16_align2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x half> %data, ptr addrspace(7) %p, align 2
+ ret void
+}
+
define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v4bf16:
; SDAG: ; %bb.0:
@@ -2391,6 +2479,72 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <8 x i8> @load_v8i8_align1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i8_align1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i8_align1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i8>, ptr addrspace(7) %p, align 1
+ ret <8 x i8> %ret
+}
+
+define void @store_v8i8_align1(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i8_align1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i8_align1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i8> %data, ptr addrspace(7) %p, align 1
+ ret void
+}
+
define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v12i8:
; SDAG: ; %bb.0:
@@ -2912,12 +3066,33 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_a2f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x half], ptr addrspace(7) %p
+ ret [2 x half] %ret
+}
+
+define [2 x half] @load_a2f16_align4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2f16_align4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: load_a2f16:
+; GISEL-LABEL: load_a2f16_align4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
@@ -2925,7 +3100,7 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
- %ret = load [2 x half], ptr addrspace(7) %p
+ %ret = load [2 x half], ptr addrspace(7) %p, align 4
ret [2 x half] %ret
}
@@ -2933,13 +3108,34 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_a2f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x half] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define void @store_a2f16_align4([2 x half] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2f16_align4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: s_mov_b32 s4, 0x5040100
; SDAG-NEXT: v_perm_b32 v0, v1, v0, s4
; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: store_a2f16:
+; GISEL-LABEL: store_a2f16_align4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -2948,7 +3144,7 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
- store [2 x half] %data, ptr addrspace(7) %p
+ store [2 x half] %data, ptr addrspace(7) %p, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index d18f0f8bd1ff9..53c0a742597a1 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -165,6 +165,28 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <2 x i32> @load_v2i32_align1(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <2 x i32> @load_v2i32_align1(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret <2 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i32>, ptr addrspace(7) %p, align 1
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32_align1(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_v2i32_align1(
+; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i32> %data, ptr addrspace(7) %p, align 1
+ ret void
+}
+
define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x i32> @load_v3i32(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
@@ -363,6 +385,36 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <2 x half> @load_v2f16_align2(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <2 x half> @load_v2f16_align2(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to <2 x half>
+; CHECK-NEXT: ret <2 x half> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x half>, ptr addrspace(7) %p, align 2
+ ret <2 x half> %ret
+}
+
+define void @store_v2f16_align2(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_v2f16_align2(
+; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x half> [[DATA]] to <2 x i16>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x half> %data, ptr addrspace(7) %p, align 2
+ ret void
+}
+
define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x bfloat> @load_v4bf16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
@@ -1400,6 +1452,58 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <8 x i8> @load_v8i8_align1(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <8 x i8> @load_v8i8_align1(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; CHECK-NEXT: ret <8 x i8> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i8>, ptr addrspace(7) %p, align 1
+ ret <8 x i8> %ret
+}
+
+define void @store_v8i8_align1(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_v8i8_align1(
+; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i8> %data, ptr addrspace(7) %p, align 1
+ ret void
+}
+
define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <12 x i8> @load_v12i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
@@ -1543,7 +1647,11 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x half] @load_a2f16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to <2 x half>
; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1
@@ -1562,7 +1670,11 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x half> [[DATA_AS_VEC_1]] to <2 x i16>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
>From 004c8521509dbb8a4039cb8ece45c89d489b4ff5 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 26 Mar 2025 20:13:50 +0000
Subject: [PATCH 2/2] Rename function
---
.../lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index ee23a14960767..756f7fd12367c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -699,7 +699,7 @@ class LegalizeBufferContentTypesVisitor
/// intrinsics to one that would be legal through bitcasts and/or truncation.
/// Uses the wider of i32, i16, or i8 where possible, accounting for the
/// alignment of the load or store.
- Type *legalNonAggregateFor(Type *T, Align A);
+ Type *legalNonAggregateForMemOp(Type *T, Align A);
Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
@@ -793,8 +793,8 @@ Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
return ArrayRes;
}
-Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T,
- Align A) {
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateForMemOp(Type *T,
+ Align A) {
TypeSize Size = DL.getTypeStoreSizeInBits(T);
// Implicitly zero-extend to the next byte if needed
if (!DL.typeSizeEqualsStoreSize(T))
@@ -1022,7 +1022,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
Align PartAlign = commonAlignment(OrigLI.getAlign(), AggByteOff);
Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType, PartAlign);
+ Type *LegalType = legalNonAggregateForMemOp(ArrayAsVecType, PartAlign);
SmallVector<VecSlice> Slices;
getVecSlices(LegalType, PartAlign, Slices);
@@ -1152,7 +1152,7 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
}
Align PartAlign = commonAlignment(OrigSI.getAlign(), AggByteOff);
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType, PartAlign);
+ Type *LegalType = legalNonAggregateForMemOp(ArrayAsVecType, PartAlign);
if (LegalType != ArrayAsVecType) {
NewData = makeLegalNonAggregate(NewData, LegalType, Name);
}
More information about the llvm-commits
mailing list