[llvm] [LowerBufferFatPointers] Correctly handle alignment modes (PR #134329)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 17:23:44 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Krzysztof Drewniak (krzysz00)
Changes
Previously, AMDGPULowerBufferFatPointers would emit unaligned buffer loads/stores, even when such unaligned accesses were disabled (that is, on non-HSA platforms).
In addition, the lowering did not respect the newly-added relaxed-buffer-oob-mode feature, which now must be enabled in order to vectorize unaligned loads from buffers.
This commit fixes both issues and adds tests.
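As a rough illustration (a hypothetical input, not one of the added tests), the kind of access affected is an under-aligned vector load or store through a buffer fat pointer: with unaligned buffer accesses disabled on the subtarget, the lowering must split such an access into naturally aligned pieces rather than emit a single unaligned buffer operation.

```llvm
; Hypothetical example: an align(2) load of <4 x i16> through a buffer fat
; pointer. When the subtarget does not allow unaligned buffer accesses, the
; lowering now breaks this into 16-bit buffer loads instead of emitting one
; under-aligned 64-bit buffer load.
define <4 x i16> @underaligned_vector_load(ptr addrspace(7) %p) {
  %v = load <4 x i16>, ptr addrspace(7) %p, align 2
  ret <4 x i16> %v
}
```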
---
Patch is 1.32 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134329.diff
11 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+106-28)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+1135-554)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+829-379)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+829-379)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll (+1076-719)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+160-210)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll (+1-1)
- (added) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization-alignment.ll (+3163)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll (+802-172)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+967-389)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index e6250ddf2c26b..a17511d71b997 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -250,6 +250,7 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
@@ -688,6 +689,10 @@ class LegalizeBufferContentTypesVisitor
const DataLayout &DL;
+ // Subtarget info, needed for determining which unaligned-access and
+ // out-of-bounds access modes are in effect.
+ const TargetMachine *TM;
+ const GCNSubtarget *ST = nullptr;
+
/// If T is [N x U], where U is a scalar type, return the vector type
/// <N x U>, otherwise, return T.
Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
@@ -696,10 +701,32 @@ class LegalizeBufferContentTypesVisitor
/// Break up the loads of a struct into the loads of its components
+ /// Return the maximum allowed load/store width for the given type and
+ /// alignment combination based on subtarget flags.
+ /// 1. If unaligned accesses are not enabled, then any load/store that is less
+ /// than word-aligned has to be handled one byte or ushort at a time.
+ /// 2. If relaxed OOB mode is not set, we must ensure that the in-bounds
+ /// part of a partially out of bounds read/write is performed correctly. This
+ /// means that any load that isn't naturally aligned has to be split into
+ /// parts that are naturally aligned, so that, after bitcasting, we don't have
+ /// unaligned loads that could discard valid data.
+ ///
+ /// For example, if we're loading a <8 x i8>, that's actually a load of a <2 x
+ /// i32>, and if we load from an align(2) address, that address might be 2
+ /// bytes from the end of the buffer. The hardware will, when performing the
+ /// <2 x i32> load, mask off the entire first word, causing the two in-bounds
+ /// bytes to be masked off.
+ ///
+ /// Unlike the complete disablement of unaligned accesses from point 1,
+ /// this does not apply to unaligned scalars, but will apply to cases like
+ /// `load <2 x i32>, align 4` since the left element might be out of bounds.
+ uint64_t maxIntrinsicWidth(Type *Ty, Align A);
+
/// Convert a vector or scalar type that can't be operated on by buffer
/// intrinsics to one that would be legal through bitcasts and/or truncation.
- /// Uses the wider of i32, i16, or i8 where possible.
- Type *legalNonAggregateFor(Type *T);
+ /// Uses the wider of i32, i16, or i8 where possible, clamping to the maximum
+ /// allowed width under the alignment rules and subtarget flags.
+ Type *legalNonAggregateForMemOp(Type *T, uint64_t MaxWidth);
Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
@@ -713,8 +740,9 @@ class LegalizeBufferContentTypesVisitor
/// Return the [index, length] pairs into which `T` needs to be cut to form
/// legal buffer load or store operations. Clears `Slices`. Creates an empty
/// `Slices` for non-vector inputs and creates one slice if no slicing will be
- /// needed.
- void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
+ /// needed. No slice may be larger than `MaxWidth`.
+ void getVecSlices(Type *T, uint64_t MaxWidth,
+ SmallVectorImpl<VecSlice> &Slices);
Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
@@ -743,8 +771,9 @@ class LegalizeBufferContentTypesVisitor
bool visitStoreInst(StoreInst &SI);
public:
- LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
- : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
+ LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx,
+ const TargetMachine *TM)
+ : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL), TM(TM) {}
bool processFunction(Function &F);
};
} // namespace
@@ -791,7 +820,48 @@ Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
return ArrayRes;
}
-Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+uint64_t LegalizeBufferContentTypesVisitor::maxIntrinsicWidth(Type *T,
+ Align A) {
+ Align Result(16);
+ if (!ST->hasUnalignedBufferAccessEnabled() && A < Align(4))
+ Result = A;
+ auto *VT = dyn_cast<VectorType>(T);
+ if (!ST->hasRelaxedBufferOOBMode() && VT) {
+ TypeSize ElemBits = DL.getTypeSizeInBits(VT->getElementType());
+ if (ElemBits.isKnownMultipleOf(32)) {
+ // Word-sized operations are bounds-checked per word. So, the only case we
+ // have to worry about is stores that start out of bounds and then go in,
+ // and those can only become in-bounds on a multiple of their alignment.
+ // Therefore, we can use the declared alignment of the operation as the
+ // maximum width, rounding up to 4.
+ Result = std::min(Result, std::max(A, Align(4)));
+ } else if (ElemBits.isKnownMultipleOf(8) ||
+ isPowerOf2_64(ElemBits.getKnownMinValue())) {
+ // To ensure correct behavior for sub-word types, we must always scalarize
+ // unaligned loads of sub-word types. For example, if you load
+ // a <4 x i8> from offset 7 in an 8-byte buffer, expecting the vector
+ // to be padded out with 0s after that last byte, you'll get all 0s
+ // instead. To prevent this behavior when not requested, de-vectorize such
+ // loads.
+ //
+ // This condition could be looser and mirror the word-length condition
+ // if we were allowed to assume that the number of records in a buffer
+ // was a multiple of 4 - then, we could always use the vector's
+ // alignment of the access on the assumption that no one wants their
+ // mask to kick in mid-word.
+ //
+ // Strict OOB checking isn't supported if the size of each element is a
+ // non-power-of-2 value less than 8, since there's no feasible way to
+ // apply such a strict bounds check.
+ Result =
+ commonAlignment(Result, divideCeil(ElemBits.getKnownMinValue(), 8));
+ }
+ }
+ return Result.value() * 8;
+}
+
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateForMemOp(
+ Type *T, uint64_t MaxWidth) {
TypeSize Size = DL.getTypeStoreSizeInBits(T);
// Implicitly zero-extend to the next byte if needed
if (!DL.typeSizeEqualsStoreSize(T))
@@ -803,15 +873,16 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
return T;
}
unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
- if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+ if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= MaxWidth) {
// [vectors of] anything that's 16/32/64/128 bits can be cast and split into
- // legal buffer operations.
+ // legal buffer operations, except that we might need to cut them into
+ // smaller values if we're not allowed to do unaligned vector loads.
return T;
}
Type *BestVectorElemType = nullptr;
- if (Size.isKnownMultipleOf(32))
+ if (Size.isKnownMultipleOf(32) && MaxWidth >= 32)
BestVectorElemType = IRB.getInt32Ty();
- else if (Size.isKnownMultipleOf(16))
+ else if (Size.isKnownMultipleOf(16) && MaxWidth >= 16)
BestVectorElemType = IRB.getInt16Ty();
else
BestVectorElemType = IRB.getInt8Ty();
@@ -884,7 +955,7 @@ Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
}
void LegalizeBufferContentTypesVisitor::getVecSlices(
- Type *T, SmallVectorImpl<VecSlice> &Slices) {
+ Type *T, uint64_t MaxWidth, SmallVectorImpl<VecSlice> &Slices) {
Slices.clear();
auto *VT = dyn_cast<FixedVectorType>(T);
if (!VT)
@@ -905,8 +976,8 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
uint64_t TotalElems = VT->getNumElements();
uint64_t Index = 0;
- auto TrySlice = [&](unsigned MaybeLen) {
- if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
+ auto TrySlice = [&](unsigned MaybeLen, unsigned Width) {
+ if (MaybeLen > 0 && Width <= MaxWidth && Index + MaybeLen <= TotalElems) {
VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
Slices.push_back(Slice);
Index += MaybeLen;
@@ -915,9 +986,9 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
return false;
};
while (Index < TotalElems) {
- TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
- TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
- TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+ TrySlice(ElemsPer4Words, 128) || TrySlice(ElemsPer3Words, 96) ||
+ TrySlice(ElemsPer2Words, 64) || TrySlice(ElemsPerWord, 32) ||
+ TrySlice(ElemsPerShort, 16) || TrySlice(ElemsPerByte, 8);
}
}
@@ -1004,11 +1075,13 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
// Typical case
+ Align PartAlign = commonAlignment(OrigLI.getAlign(), AggByteOff);
Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ uint64_t MaxWidth = maxIntrinsicWidth(ArrayAsVecType, PartAlign);
+ Type *LegalType = legalNonAggregateForMemOp(ArrayAsVecType, MaxWidth);
SmallVector<VecSlice> Slices;
- getVecSlices(LegalType, Slices);
+ getVecSlices(LegalType, MaxWidth, Slices);
bool HasSlices = Slices.size() > 1;
bool IsAggPart = !AggIdxs.empty();
Value *LoadsRes;
@@ -1045,7 +1118,8 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
Value *NewPtr = IRB.CreateGEP(
IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
- GEPNoWrapFlags::noUnsignedWrap());
+ ST->hasRelaxedBufferOOBMode() ? GEPNoWrapFlags::noUnsignedWrap()
+ : GEPNoWrapFlags::none());
Type *LoadableType = intrinsicTypeFor(SliceType);
LoadInst *NewLI = IRB.CreateAlignedLoad(
LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
@@ -1134,13 +1208,15 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
NewData = arrayToVector(NewData, ArrayAsVecType, Name);
}
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ Align PartAlign = commonAlignment(OrigSI.getAlign(), AggByteOff);
+ uint64_t MaxWidth = maxIntrinsicWidth(ArrayAsVecType, PartAlign);
+ Type *LegalType = legalNonAggregateForMemOp(ArrayAsVecType, MaxWidth);
if (LegalType != ArrayAsVecType) {
NewData = makeLegalNonAggregate(NewData, LegalType, Name);
}
SmallVector<VecSlice> Slices;
- getVecSlices(LegalType, Slices);
+ getVecSlices(LegalType, MaxWidth, Slices);
bool NeedToSplit = Slices.size() > 1 || IsAggPart;
if (!NeedToSplit) {
Type *StorableType = intrinsicTypeFor(LegalType);
@@ -1161,10 +1237,11 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
Type *SliceType =
S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
- Value *NewPtr =
- IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
- OrigPtr->getName() + ".part." + Twine(S.Index),
- GEPNoWrapFlags::noUnsignedWrap());
+ Value *NewPtr = IRB.CreateGEP(
+ IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
+ OrigPtr->getName() + ".part." + Twine(S.Index),
+ ST->hasRelaxedBufferOOBMode() ? GEPNoWrapFlags::noUnsignedWrap()
+ : GEPNoWrapFlags::none());
Value *DataSlice = extractSlice(NewData, S, Name);
Type *StorableType = intrinsicTypeFor(SliceType);
DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
@@ -1193,6 +1270,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
}
bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
+ ST = &TM->getSubtarget<GCNSubtarget>(F);
bool Changed = false;
// Note, memory transfer intrinsics won't
for (Instruction &I : make_early_inc_range(instructions(F))) {
@@ -2438,8 +2516,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
M.getContext(), &TM);
- LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
- M.getContext());
+ LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(
+ DL, M.getContext(), &TM);
for (Function &F : M.functions()) {
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
bool BodyChanges = containsBufferFatPointers(F, &StructTM);
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index b66ee994ce7ee..0532b82caf422 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -6254,19 +6254,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
+; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v1, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v0, v0, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6287,12 +6294,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
@@ -6324,13 +6336,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
@@ -6351,13 +6367,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
@@ -6379,17 +6399,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6425,17 +6446,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; ...
[truncated]
``````````
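For concreteness, the `<8 x i8>` case described in the new `maxIntrinsicWidth` documentation above corresponds to IR along these lines (a hypothetical input, not copied from the added tests):

```llvm
; Hypothetical IR for the <8 x i8> case discussed in maxIntrinsicWidth's
; comment: without relaxed-buffer-oob-mode, widening this align(2) load into a
; single <2 x i32> buffer load could let the hardware's per-word bounds check
; mask off in-bounds bytes near the end of the buffer, so the lowering
; scalarizes the access instead.
define <8 x i8> @strict_oob_subword_vector(ptr addrspace(7) %p) {
  %v = load <8 x i8>, ptr addrspace(7) %p, align 2
  ret <8 x i8> %v
}
```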
https://github.com/llvm/llvm-project/pull/134329